Index: llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -160,12 +160,23 @@ // Build the instruction used to repair, then clone it at the right // places. Avoiding buildCopy bypasses the check that Src and Dst have the - // same types because the type is a placeholder when this function is called. - MI = MIRBuilder.buildInstrNoInsert(TargetOpcode::COPY) - .addDef(Dst) - .addUse(Src); - LLVM_DEBUG(dbgs() << "Copy: " << printReg(Src) << " to: " << printReg(Dst) - << '\n'); + // same types because the type is a placeholder when this function is + // called. Constants can be repaired by simply rematerializing them. + MachineInstr *SrcMI = MRI->getUniqueVRegDef(Src); + if (SrcMI && SrcMI->getOpcode() == TargetOpcode::G_CONSTANT && + MRI->hasOneNonDBGUse(SrcMI->getOperand(0).getReg())) { + MI = MIRBuilder.buildInstrNoInsert(TargetOpcode::G_CONSTANT) + .addDef(Dst) + .addCImm(SrcMI->getOperand(1).getCImm()); + LLVM_DEBUG(dbgs() << "Const: " << *SrcMI->getOperand(1).getCImm() + << " to: " << printReg(Dst) << '\n'); + } else { + MI = MIRBuilder.buildInstrNoInsert(TargetOpcode::COPY) + .addDef(Dst) + .addUse(Src); + LLVM_DEBUG(dbgs() << "Copy: " << printReg(Src) << " to: " << printReg(Dst) + << '\n'); + } } else { // TODO: Support with G_IMPLICIT_DEF + G_INSERT sequence or G_EXTRACT // sequence. Index: llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -156,7 +156,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 4 -; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0 +; GFX8-NEXT: v_subrev_u16_e32 v1, 64, v0 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -182,10 +182,10 @@ ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffc0 -; GFX8-NEXT: v_add_u16_e32 v2, 4, v0 -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 64 +; GFX8-NEXT: v_add_u16_e32 v1, 4, v0 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_hi: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -8,7 +8,6 @@ ; GFX6-LABEL: v_ashr_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -123,7 +122,6 @@ ; GCN-LABEL: v_ashr_i24: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -132,7 +130,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GFX10-NEXT: v_ashrrev_i32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -575,7 +572,6 @@ ; GFX6-LABEL: v_ashr_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -667,7 +663,6 @@ define amdgpu_ps half @ashr_i16_sv(i16 inreg %value, i16 %amount) { ; GFX6-LABEL: ashr_i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: v_ashr_i32_e32 v0, s0, v0 ; GFX6-NEXT: ; return to shader part epilog Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -2216,9 +2216,9 @@ ; GFX9-NEXT: s_mov_b32 s1, 16 ; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: s_lshr_b32 s4, s2, 2 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: v_mov_b32_e32 v6, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2229,15 +2229,15 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v15 +; GFX9-NEXT: v_and_or_b32 v2, v2, s3, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_or3_b32 v0, v0, v12, v7 ; GFX9-NEXT: v_or3_b32 v1, v1, v14, v8 @@ -2268,32 +2268,32 @@ ; GFX8-NEXT: s_and_b32 s1, s2, 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v10 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v11 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 @@ -2371,37 +2371,36 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v5, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff -; GFX10-NEXT: v_mov_b32_e32 v6, 16 +; GFX10-NEXT: v_mov_b32_e32 v4, 8 +; GFX10-NEXT: v_mov_b32_e32 v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v11, v7 -; GFX10-NEXT: v_or3_b32 v1, v1, v13, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, v2, s3, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v10, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v12, v7 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: v_or3_b32 v2, v2, v15, v9 -; GFX10-NEXT: v_and_or_b32 v4, v3, v4, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10 +; GFX10-NEXT: v_or3_b32 v2, v2, v14, v8 +; GFX10-NEXT: v_and_or_b32 v4, 0xff, v3, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 ; GFX10-NEXT: v_or3_b32 v1, v4, v3, v5 @@ -2441,15 +2440,15 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v5, v5, v0, v17 +; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_and_or_b32 v0, v6, v0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v12 @@ -2479,38 +2478,38 @@ ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; GFX8-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v12 ; GFX8-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v5, v17 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v11 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v13 -; GFX8-NEXT: v_or_b32_e32 v5, v6, v8 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_or_b32_e32 v1, v6, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v9 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2583,43 +2582,42 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s5, 16 ; GFX10-NEXT: s_movk_i32 s6, 0xff -; GFX10-NEXT: v_mov_b32_e32 v0, 0xff -; GFX10-NEXT: v_mov_b32_e32 v7, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 2, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 16 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v3, v3, s6, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v3, v3, s6, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v4, v4, s6, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_and_or_b32 v4, v4, s6, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v17, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v5, v5, s6, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v5, v5, v0, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX10-NEXT: v_or3_b32 v3, v3, v14, v9 -; GFX10-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v6, v0, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GFX10-NEXT: v_or3_b32 v5, v5, v18, v11 +; GFX10-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX10-NEXT: v_or3_b32 v4, v4, v15, v9 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v11 +; GFX10-NEXT: v_or3_b32 v5, v5, v17, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 -; GFX10-NEXT: v_or3_b32 v0, v0, v7, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -4078,11 +4078,8 @@ ; MOVREL-LABEL: v_extract_v64i32_32: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80 -; MOVREL-NEXT: v_mov_b32_e32 v2, s4 -; MOVREL-NEXT: v_mov_b32_e32 v3, s5 -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 +; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: s_setpc_b64 s[30:31] @@ -4111,11 +4108,8 @@ ; MOVREL-LABEL: v_extract_v64i32_33: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80 -; MOVREL-NEXT: v_mov_b32_e32 v2, s4 -; MOVREL-NEXT: v_mov_b32_e32 v3, s5 -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 +; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: v_mov_b32_e32 v0, v1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -523,14 +523,14 @@ ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 -; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_add_i32 s0, s0, 4 -; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e80 +; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; @@ -540,15 +540,15 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s0, 0x3e80 -; GFX10-NEXT: s_add_i32 s0, s0, 4 -; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x3e80 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: scratch_store_dword off, v1, off offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_store_dword off, v1, s0 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc +; GFX10-NEXT: scratch_load_dword v0, v0, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm bb: @@ -567,15 +567,15 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 13 -; GFX9-NEXT: s_movk_i32 s0, 0x3e80 -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_add_i32 s0, s0, vcc_hi -; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e80 +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 +; GFX9-NEXT: v_add_u32_e32 v0, vcc_hi, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: scratch_load_dword v0, off, s0 glc +; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -583,16 +583,16 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 13 -; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s0, 0x3e80 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x3e80 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 -; GFX10-NEXT: s_add_i32 s0, s0, vcc_lo -; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 +; GFX10-NEXT: v_mov_b32_e32 v1, 13 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v0, vcc_lo, v0 +; GFX10-NEXT: scratch_store_dword off, v1, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_store_dword off, v1, s0 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc +; GFX10-NEXT: scratch_load_dword v0, v0, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] bb: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -1840,11 +1840,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffffe8 ; GFX10-NEXT: s_movk_i32 s9, 0xff ; GFX10-NEXT: s_lshr_b32 s10, s1, 8 -; GFX10-NEXT: s_bfe_u32 s11, 8, 0x100000 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX10-NEXT: s_bfe_u32 s11, 8, 0x100000 ; GFX10-NEXT: s_and_b32 s1, s1, s9 ; GFX10-NEXT: s_lshr_b32 s6, s0, 8 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 @@ -1852,39 +1853,39 @@ ; GFX10-NEXT: s_and_b32 s6, s6, s9 ; GFX10-NEXT: s_or_b32 s1, s8, s1 ; GFX10-NEXT: s_lshr_b32 s8, s4, 8 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, s9 ; GFX10-NEXT: s_lshl_b32 s6, s6, s11 -; GFX10-NEXT: s_and_b32 s8, s8, s9 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX10-NEXT: s_and_b32 s8, s8, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_and_b32 s6, s7, s9 +; GFX10-NEXT: v_mul_lo_u32 v3, v2, v0 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, v1 ; GFX10-NEXT: s_and_b32 s7, s10, s9 -; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 ; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: s_lshr_b32 s12, s4, 24 ; GFX10-NEXT: s_and_b32 s4, s4, s9 ; GFX10-NEXT: s_lshl_b32 s8, s8, s11 ; GFX10-NEXT: s_lshr_b32 s13, s5, 8 +; GFX10-NEXT: v_mul_hi_u32 v3, v0, v3 ; GFX10-NEXT: s_or_b32 s4, s4, s8 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: s_and_b32 s8, s10, s9 -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX10-NEXT: s_lshl_b32 s8, s8, 16 ; GFX10-NEXT: s_and_b32 s5, s5, s9 -; GFX10-NEXT: s_or_b32 s4, s4, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v3 ; GFX10-NEXT: s_lshl_b32 s5, s5, s11 +; GFX10-NEXT: s_or_b32 s4, s4, s8 ; GFX10-NEXT: s_and_b32 s8, s13, s9 ; GFX10-NEXT: s_or_b32 s5, s12, s5 ; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX10-NEXT: s_lshl_b32 s8, s8, 16 @@ -2141,23 +2142,24 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffffffe8 ; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX10-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX10-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 ; GFX10-NEXT: v_bfe_u32 v3, v3, 1, 23 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX10-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6 -; GFX10-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7 -; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8 -; GFX10-NEXT: v_mul_hi_u32 v9, v7, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v9 +; GFX10-NEXT: v_mul_lo_u32 v9, v8, v6 +; GFX10-NEXT: v_mul_lo_u32 v8, v8, v7 +; GFX10-NEXT: v_mul_hi_u32 v9, v6, v9 +; GFX10-NEXT: v_mul_hi_u32 v8, v7, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v8 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -1056,11 +1056,12 @@ ; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 ; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, 1 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xff +; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 ; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 @@ -1071,7 +1072,6 @@ ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 @@ -1107,11 +1107,11 @@ ; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 ; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 ; GFX9-NEXT: v_mov_b32_e32 v6, 1 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xff +; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v4, v8 ; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 @@ -1122,7 +1122,6 @@ ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 ; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 @@ -1136,42 +1135,41 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX10-NEXT: v_xor_b32_e32 v12, -1, v7 ; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 -; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 +; GFX10-NEXT: v_xor_b32_e32 v13, -1, v10 ; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b16 v3, v12, v3 +; GFX10-NEXT: v_xor_b32_e32 v12, -1, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 ; GFX10-NEXT: v_and_b32_e32 v8, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 -; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 +; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 +; GFX10-NEXT: v_lshlrev_b16 v4, v13, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v5, v13, v5 -; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 +; GFX10-NEXT: v_lshlrev_b16 v5, v12, v5 +; GFX10-NEXT: v_lshrrev_b16 v7, v11, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 @@ -1849,111 +1847,112 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 -; GFX10-NEXT: s_movk_i32 s9, 0xff +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffffe8 +; GFX10-NEXT: s_movk_i32 s8, 0xff ; GFX10-NEXT: s_lshr_b32 s12, s4, 8 -; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s13, s4, 16 -; GFX10-NEXT: s_and_b32 s12, s12, s9 +; GFX10-NEXT: s_and_b32 s12, s12, s8 ; GFX10-NEXT: s_lshr_b32 s14, s4, 24 -; GFX10-NEXT: s_and_b32 s4, s4, s9 +; GFX10-NEXT: s_and_b32 s4, s4, s8 ; GFX10-NEXT: s_lshl_b32 s12, s12, s10 -; GFX10-NEXT: s_and_b32 s13, s13, s9 +; GFX10-NEXT: s_and_b32 s13, s13, s8 ; GFX10-NEXT: s_or_b32 s4, s4, s12 -; GFX10-NEXT: s_bfe_u32 s12, s13, 0x100000 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: s_bfe_u32 s12, s13, 0x100000 ; GFX10-NEXT: s_lshr_b32 s15, s5, 8 ; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_lshl_b32 s12, s12, 16 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s5, s5, s9 +; GFX10-NEXT: s_lshl_b32 s12, s12, 16 +; GFX10-NEXT: s_and_b32 s5, s5, s8 ; GFX10-NEXT: s_or_b32 s4, s4, s12 +; GFX10-NEXT: v_mul_lo_u32 v3, v2, v0 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, v1 ; GFX10-NEXT: s_lshl_b32 s5, s5, s10 -; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_and_b32 s12, s15, s9 +; GFX10-NEXT: s_and_b32 s12, s15, s8 ; GFX10-NEXT: s_or_b32 s5, s14, s5 ; GFX10-NEXT: s_bfe_u32 s12, s12, 0x100000 ; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX10-NEXT: s_lshl_b32 s12, s12, 16 -; GFX10-NEXT: s_lshr_b32 s11, s1, 8 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, v0, v3 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX10-NEXT: s_or_b32 s5, s5, s12 -; GFX10-NEXT: s_and_b32 s1, s1, s9 +; GFX10-NEXT: s_lshr_b32 s11, s1, 8 +; GFX10-NEXT: s_and_b32 s1, s1, s8 ; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: s_lshr_b32 s9, s0, 24 ; GFX10-NEXT: s_lshl_b32 s1, s1, s10 -; GFX10-NEXT: s_and_b32 s6, s6, s9 -; GFX10-NEXT: s_or_b32 s1, s8, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX10-NEXT: s_lshr_b32 s8, s2, 8 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s9 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: s_and_b32 s6, s6, s8 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, s8 ; GFX10-NEXT: s_lshl_b32 s6, s6, s10 -; GFX10-NEXT: s_and_b32 s8, s8, s9 +; GFX10-NEXT: s_and_b32 s9, s9, s8 ; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX10-NEXT: s_and_b32 s6, s7, s9 -; GFX10-NEXT: s_and_b32 s7, s11, s9 -; GFX10-NEXT: s_lshr_b32 s11, s2, 16 +; GFX10-NEXT: s_and_b32 s6, s7, s8 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX10-NEXT: s_and_b32 s7, s11, s8 +; GFX10-NEXT: s_lshr_b32 s11, s2, 16 ; GFX10-NEXT: s_lshr_b32 s13, s2, 24 -; GFX10-NEXT: s_and_b32 s2, s2, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, s10 +; GFX10-NEXT: s_and_b32 s2, s2, s8 +; GFX10-NEXT: s_lshl_b32 s9, s9, s10 ; GFX10-NEXT: s_lshr_b32 s12, s3, 8 -; GFX10-NEXT: s_or_b32 s2, s2, s8 -; GFX10-NEXT: s_and_b32 s8, s11, s9 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: s_or_b32 s2, s2, s9 +; GFX10-NEXT: s_and_b32 s9, s11, s8 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s3, s3, s9 -; GFX10-NEXT: s_or_b32 s2, s2, s4 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xffffff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: s_bfe_u32 s4, s9, 0x100000 +; GFX10-NEXT: s_and_b32 s3, s3, s8 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: s_lshl_b32 s3, s3, s10 -; GFX10-NEXT: s_and_b32 s5, s12, s9 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_mov_b32 s4, 0xffffff ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: s_and_b32 s5, s12, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: s_or_b32 s3, s13, s3 ; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_lshl_b32 s5, s5, 16 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_or_b32 s3, s3, s5 -; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX10-NEXT: s_lshl_b32 s5, s6, 17 +; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v1 ; GFX10-NEXT: v_and_b32_e32 v1, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_lshl_b32 s5, s6, 17 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_lshl_b32 s2, s7, 17 ; GFX10-NEXT: v_and_b32_e32 v2, v4, v2 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s3 ; GFX10-NEXT: s_or_b32 s0, s5, s0 +; GFX10-NEXT: s_lshl_b32 s2, s7, 17 ; GFX10-NEXT: s_lshl_b32 s1, s1, 1 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v3, v0 ; GFX10-NEXT: s_or_b32 s0, s2, s1 @@ -1961,10 +1960,10 @@ ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_and_b32_e32 v3, s9, v1 +; GFX10-NEXT: v_and_b32_e32 v3, s8, v1 ; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX10-NEXT: v_and_or_b32 v2, v0, s9, v2 +; GFX10-NEXT: v_and_or_b32 v2, v0, s8, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4 @@ -2156,11 +2155,12 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 +; GFX10-NEXT: v_mov_b32_e32 v8, 0xffffffe8 ; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 ; GFX10-NEXT: v_and_b32_e32 v2, v2, v10 ; GFX10-NEXT: v_and_b32_e32 v3, v3, v10 @@ -2169,12 +2169,12 @@ ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX10-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6 -; GFX10-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7 -; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8 -; GFX10-NEXT: v_mul_hi_u32 v9, v7, v9 -; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8 -; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v9 +; GFX10-NEXT: v_mul_lo_u32 v9, v8, v6 +; GFX10-NEXT: v_mul_lo_u32 v8, v8, v7 +; GFX10-NEXT: v_mul_hi_u32 v9, v6, v9 +; GFX10-NEXT: v_mul_hi_u32 v8, v7, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v8 ; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -18,8 +18,8 @@ ; GFX9-NEXT: s_andn2_b32 s0, s0, s1 ; GFX9-NEXT: s_or_b32 s0, s0, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -36,8 +36,8 @@ ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: s_or_b32 s0, s0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -62,17 +62,17 @@ ; GFX10-LABEL: insertelement_s_v2i16_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_and_b32 s1, s5, 1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_and_b32 s3, s4, s2 -; GFX10-NEXT: s_lshl_b32 s2, s2, s1 -; GFX10-NEXT: s_lshl_b32 s1, s3, s1 +; GFX10-NEXT: s_and_b32 s2, s5, 1 +; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: s_lshl_b32 s2, s2, 4 +; GFX10-NEXT: s_and_b32 s3, s4, s1 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s0, s0, s2 -; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -85,37 +85,37 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v2i16_s_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v2, v[0:1], off -; GFX9-NEXT: s_and_b32 s0, s3, 1 -; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_and_b32 s2, s2, s1 -; GFX9-NEXT: s_lshl_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s0, s1, s0 -; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s3, 1 +; GFX9-NEXT: s_and_b32 s2, s2, s0 +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v2, s0, v3 +; GFX9-NEXT: v_and_or_b32 v2, v1, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i16_s_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: s_and_b32 s1, s3, 1 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 ; GFX8-NEXT: s_and_b32 s2, s2, s0 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_or_b32_e32 v2, s2, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -139,18 +139,18 @@ ; ; GFX10-LABEL: insertelement_v_v2i16_s_s: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v2, v[0:1], off -; GFX10-NEXT: s_and_b32 s0, s3, 1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_and_b32 s2, s2, s1 -; GFX10-NEXT: s_lshl_b32 s1, s1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s2, s0 -; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: global_load_dword v1, v[0:1], off +; GFX10-NEXT: s_and_b32 s1, s3, 1 +; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: s_lshl_b32 s1, s1, 4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_and_b32 s2, s2, s0 +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 +; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v2, s1, s0 +; GFX10-NEXT: v_and_or_b32 v2, v1, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1 )* %ptr @@ -166,14 +166,14 @@ ; GFX9-NEXT: s_and_b32 s1, s4, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_lshl_b32 s2, s2, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_andn2_b32 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_lshl_or_b32 v2, v0, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, s1, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -185,12 +185,12 @@ ; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 +; GFX8-NEXT: v_or_b32_e32 v2, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -214,17 +214,17 @@ ; GFX10-LABEL: insertelement_s_v2i16_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_and_b32 s1, s4, 1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: v_and_b32_e32 v2, s2, v0 -; GFX10-NEXT: s_lshl_b32 s2, s2, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_and_b32 s2, s4, 1 +; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: s_lshl_b32 s2, s2, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s0, s0, s2 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, s1, s0 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_andn2_b32 s0, s0, s1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, s2, s0 +; GFX10-NEXT: global_store_dword v[1:2], v0, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx @@ -240,13 +240,13 @@ ; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: s_and_b32 s2, s4, s1 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v0, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2 +; GFX9-NEXT: v_and_or_b32 v2, s0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -257,14 +257,14 @@ ; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_and_b32 s2, s4, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -295,11 +295,11 @@ ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 ; GFX10-NEXT: s_and_b32 s1, s4, s1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, s0, v3, v2 +; GFX10-NEXT: v_and_or_b32 v2, s0, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr @@ -315,13 +315,13 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v1, s1 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s1 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2 +; GFX9-NEXT: v_and_or_b32 v2, s0, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -331,14 +331,14 @@ ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: s_mov_b32 s1, 0xffff -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v1, s1 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s1 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -367,12 +367,12 @@ ; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, s0, v2, v3 +; GFX10-NEXT: v_and_or_b32 v2, s0, v2, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr @@ -384,36 +384,36 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v2i16_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX9-NEXT: global_load_dword v1, v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX9-NEXT: v_lshlrev_b32_e64 v3, v2, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v3, v4, v2 +; GFX9-NEXT: v_and_or_b32 v2, v1, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i16_s_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: s_and_b32 s1, s2, s0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -437,18 +437,18 @@ ; ; GFX10-LABEL: insertelement_v_v2i16_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v1, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 -; GFX10-NEXT: s_and_b32 s0, s2, s0 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 +; GFX10-NEXT: s_and_b32 s0, s2, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2 +; GFX10-NEXT: v_and_or_b32 v2, v1, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr @@ -460,35 +460,35 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v2i16_v_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: s_and_b32 s0, s2, 1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: s_lshl_b32 s0, s1, s0 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_not_b32 s0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v3, s0, v2 +; GFX9-NEXT: v_and_or_b32 v2, v1, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i16_v_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: s_and_b32 s1, s2, 1 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -512,17 +512,17 @@ ; ; GFX10-LABEL: insertelement_v_v2i16_v_s: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v1, v[0:1], off ; GFX10-NEXT: s_and_b32 s0, s2, 1 ; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: s_lshl_b32 s0, s1, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 +; GFX10-NEXT: v_and_or_b32 v2, v1, s0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr @@ -534,34 +534,34 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v2i16_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX9-NEXT: v_and_or_b32 v2, v1, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i16_v_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -585,17 +585,17 @@ ; ; GFX10-LABEL: insertelement_v_v2i16_v_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v1, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v0, s0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 +; GFX10-NEXT: v_and_or_b32 v2, v1, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr @@ -674,15 +674,15 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v4, v5, s0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v3, v4, s0, v3 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -699,14 +699,14 @@ ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_or_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -748,12 +748,12 @@ ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v2, s0, s2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX10-NEXT: v_and_or_b32 v3, v2, s0, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1 )* %ptr @@ -777,15 +777,15 @@ ; GFX9-NEXT: s_lshl_b32 s5, s5, s4 ; GFX9-NEXT: s_andn2_b32 s3, s3, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v2, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -804,14 +804,14 @@ ; GFX8-NEXT: s_lshl_b32 s4, s5, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s3, s3, s4 -; GFX8-NEXT: v_or_b32_e32 v4, s3, v0 +; GFX8-NEXT: v_or_b32_e32 v2, s3, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -857,12 +857,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_lshl_b32 s5, s5, s4 ; GFX10-NEXT: s_andn2_b32 s3, s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3 +; GFX10-NEXT: v_lshl_or_b32 v3, v2, s4, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr @@ -888,14 +888,14 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_or_b32 v4, v1, v0, v3 +; GFX9-NEXT: v_and_or_b32 v3, v1, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -916,14 +916,14 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -959,25 +959,25 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v0 ; GFX10-NEXT: s_mov_b32 s2, 0xffff ; GFX10-NEXT: s_and_b32 s3, s4, s2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s2 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s3 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s3 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX10-NEXT: v_and_or_b32 v4, v5, v2, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr @@ -1002,14 +1002,14 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0 +; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -1029,14 +1029,14 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -1085,11 +1085,11 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 +; GFX10-NEXT: v_and_or_b32 v3, v5, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr @@ -1103,20 +1103,20 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: s_and_b32 s1, s2, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v5, v2, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v6, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off @@ -1126,21 +1126,21 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -1185,11 +1185,11 @@ ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX10-NEXT: v_and_or_b32 v3, v4, v3, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr @@ -1211,13 +1211,13 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: s_not_b32 s0, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v5, s0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v4, s0, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -1228,20 +1228,20 @@ ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_lshl_b32 s2, s2, 4 -; GFX8-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -1282,12 +1282,12 @@ ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v3, s0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 +; GFX10-NEXT: v_and_or_b32 v3, v3, s0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr @@ -1300,20 +1300,20 @@ ; GFX9-LABEL: insertelement_v_v4i16_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v6, v3, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off @@ -1322,20 +1322,20 @@ ; GFX8-LABEL: insertelement_v_v4i16_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 1, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v3, v6, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -1370,21 +1370,21 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v4, 1, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 ; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, s0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v4, v5, v4, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr @@ -1423,10 +1423,10 @@ ; GFX9-NEXT: s_cmp_eq_u32 s6, 3 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -1459,10 +1459,10 @@ ; GFX8-NEXT: s_cmp_eq_u32 s6, 3 ; GFX8-NEXT: s_cselect_b32 s3, s4, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -1510,7 +1510,7 @@ ; GFX10-NEXT: s_mov_b32 s8, 0xffff ; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s7, s1, s0 ; GFX10-NEXT: s_cmp_eq_u32 s6, 2 @@ -1558,20 +1558,20 @@ ; GFX9-NEXT: s_lshl_b32 s2, s2, s1 ; GFX9-NEXT: s_not_b32 s5, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[2:3] -; GFX9-NEXT: v_and_or_b32 v6, v7, s5, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] +; GFX9-NEXT: v_and_or_b32 v5, v6, s5, v5 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -1590,18 +1590,18 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 -; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX8-NEXT: v_or_b32_e32 v5, s5, v5 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -1655,14 +1655,14 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 -; GFX10-NEXT: v_and_or_b32 v6, v4, s3, s2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: v_and_or_b32 v5, v4, s3, s2 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1 )* %ptr @@ -1690,21 +1690,21 @@ ; GFX9-NEXT: s_lshl_b32 s7, s7, s4 ; GFX9-NEXT: s_andn2_b32 s6, s6, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v6, v0, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -1727,20 +1727,20 @@ ; GFX8-NEXT: s_lshl_b32 s4, s7, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s4, s6, s4 -; GFX8-NEXT: v_or_b32_e32 v6, s4, v0 +; GFX8-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -1803,16 +1803,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_lshl_b32 s7, s7, s4 ; GFX10-NEXT: s_andn2_b32 s6, s6, s7 -; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6 +; GFX10-NEXT: v_lshl_or_b32 v5, v4, s4, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr @@ -1844,18 +1844,18 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2 +; GFX9-NEXT: v_and_or_b32 v5, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -1882,18 +1882,18 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -1940,17 +1940,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v0 ; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: s_and_b32 s1, s4, s0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v5 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v5 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v5 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s9 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo @@ -1960,13 +1960,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 +; GFX10-NEXT: v_and_or_b32 v6, v7, v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr @@ -1997,18 +1997,18 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0 +; GFX9-NEXT: v_and_or_b32 v5, v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -2034,18 +2034,18 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -2092,16 +2092,16 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v1 ; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v5 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo @@ -2111,13 +2111,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 +; GFX10-NEXT: v_and_or_b32 v6, v7, v6, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr @@ -2143,16 +2143,16 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX9-NEXT: v_and_or_b32 v9, v9, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v8, v8, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -2172,17 +2172,17 @@ ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v1, v9, v1 -; GFX8-NEXT: v_or_b32_e32 v9, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX8-NEXT: v_or_b32_e32 v8, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v8, v7 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -2237,13 +2237,13 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 -; GFX10-NEXT: v_and_or_b32 v9, v2, v7, v0 +; GFX10-NEXT: v_and_or_b32 v8, v2, v7, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v8, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s1 +; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr @@ -2267,17 +2267,17 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX9-NEXT: v_and_or_b32 v9, v1, s5, v0 +; GFX9-NEXT: v_and_or_b32 v8, v1, s5, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -2296,18 +2296,18 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] ; GFX8-NEXT: v_and_b32_e32 v1, s5, v1 -; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v8, v1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v8, v7 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -2357,17 +2357,17 @@ ; GFX10-NEXT: s_lshl_b32 s2, s4, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 -; GFX10-NEXT: v_and_or_b32 v9, v0, s2, v1 +; GFX10-NEXT: v_and_or_b32 v8, v0, s2, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v8, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v8, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v8, s1 +; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr @@ -2392,7 +2392,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] @@ -2420,7 +2420,7 @@ ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, v8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] @@ -2481,7 +2481,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 @@ -2544,10 +2544,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_cselect_b32 s6, s16, s14 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: s_cselect_b32 s7, s16, s15 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_mov_b64 s[0:1], 16 @@ -2555,7 +2555,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -2603,10 +2602,10 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_cselect_b32 s6, s16, s14 ; GFX8-NEXT: s_cmp_eq_u32 s7, 7 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: s_cselect_b32 s7, s16, s15 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, 16 @@ -2684,8 +2683,7 @@ ; GFX10-NEXT: s_mov_b32 s2, 0xffff ; GFX10-NEXT: s_cmp_eq_u32 s7, 1 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s0, s9, s8 ; GFX10-NEXT: s_cmp_eq_u32 s7, 2 @@ -2733,7 +2731,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v7, s7 ; GFX10-NEXT: s_mov_b64 s[0:1], 16 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -2774,19 +2772,18 @@ ; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v9, v8 ; GFX9-NEXT: s_mov_b64 s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_s_s: @@ -2823,16 +2820,16 @@ ; GFX8-NEXT: v_or_b32_e32 v10, s13, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_mov_b32_e32 v9, v8 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -2904,11 +2901,10 @@ ; GFX10-NEXT: s_and_b32 s3, s3, 1 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_lshl_b32 s3, s3, 4 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: s_lshl_b32 s8, s8, s3 ; GFX10-NEXT: s_lshl_b32 s3, s9, s3 ; GFX10-NEXT: s_not_b32 s8, s8 -; GFX10-NEXT: v_mov_b32_e32 v13, 0 +; GFX10-NEXT: v_mov_b32_e32 v11, v10 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 @@ -2930,7 +2926,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v12, s2 ; GFX10-NEXT: s_mov_b64 s[0:1], 16 ; GFX10-NEXT: global_store_dwordx4 v[10:11], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1 )* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -2991,11 +2987,10 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: v_mov_b32_e32 v9, v8 ; GFX9-NEXT: s_mov_b64 s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_v_s: @@ -3050,7 +3045,7 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, v8 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -3122,9 +3117,9 @@ ; GFX10-NEXT: s_lshr_b32 s0, s4, 1 ; GFX10-NEXT: s_mov_b32 s3, 0xffff ; GFX10-NEXT: s_cmp_eq_u32 s0, 1 -; GFX10-NEXT: v_and_b32_e32 v8, s3, v0 +; GFX10-NEXT: v_and_b32_e32 v9, s3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 0 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s1, s9, s8 ; GFX10-NEXT: s_cmp_eq_u32 s0, 2 @@ -3151,9 +3146,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: s_lshl_b32 s3, s3, s2 ; GFX10-NEXT: s_andn2_b32 s1, s1, s3 -; GFX10-NEXT: v_lshl_or_b32 v10, v8, s2, s1 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_lshl_or_b32 v10, v9, s2, s1 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo @@ -3171,7 +3165,7 @@ ; GFX10-NEXT: s_mov_b64 s[0:1], 16 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -3224,6 +3218,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s22 ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] @@ -3232,12 +3227,10 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: v_mov_b32_e32 v9, v8 ; GFX9-NEXT: s_mov_b64 s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_s_v: @@ -3285,6 +3278,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v6, s22 ; GFX8-NEXT: v_mov_b32_e32 v7, s23 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] @@ -3293,8 +3287,7 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, v8 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -3368,7 +3361,7 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_and_b32 s6, s4, s5 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10 @@ -3379,29 +3372,28 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10 -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s6 +; GFX10-NEXT: v_lshlrev_b32_e64 v9, v0, s6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v1, s15, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v1, s15, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8 +; GFX10-NEXT: v_and_or_b32 v11, v12, v11, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, s12 ; GFX10-NEXT: v_mov_b32_e32 v5, s13 ; GFX10-NEXT: v_mov_b32_e32 v6, s14 ; GFX10-NEXT: v_mov_b32_e32 v7, s15 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s0 @@ -3412,7 +3404,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s5 ; GFX10-NEXT: s_mov_b64 s[0:1], 16 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -3464,6 +3456,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s18 ; GFX9-NEXT: v_mov_b32_e32 v7, s19 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] @@ -3472,12 +3465,10 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: v_mov_b32_e32 v9, v8 ; GFX9-NEXT: s_mov_b64 s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_v_v: @@ -3524,6 +3515,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v6, s18 ; GFX8-NEXT: v_mov_b32_e32 v7, s19 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] @@ -3532,8 +3524,7 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, v8 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -3607,7 +3598,7 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: s_mov_b32 s4, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 @@ -3618,8 +3609,8 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo @@ -3628,18 +3619,17 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v2, s15, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v2, s15, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8 +; GFX10-NEXT: v_and_or_b32 v11, v12, v11, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, s12 ; GFX10-NEXT: v_mov_b32_e32 v5, s13 ; GFX10-NEXT: v_mov_b32_e32 v6, s14 ; GFX10-NEXT: v_mov_b32_e32 v7, s15 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s0 @@ -3650,7 +3640,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s5 ; GFX10-NEXT: s_mov_b64 s[0:1], 16 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -3689,20 +3679,19 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] ; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_s_v: @@ -3738,17 +3727,17 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] ; GFX8-NEXT: v_and_b32_e32 v1, v11, v1 ; GFX8-NEXT: v_or_b32_e32 v11, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_mov_b32_e32 v9, v8 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -3810,7 +3799,6 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v14, 0 ; GFX10-NEXT: s_and_b32 s6, s2, s5 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 @@ -3823,7 +3811,8 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX10-NEXT: v_xor_b32_e32 v12, -1, v11 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 @@ -3833,9 +3822,8 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 -; GFX10-NEXT: v_and_or_b32 v13, v1, v11, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_and_or_b32 v13, v1, v12, v2 +; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v13, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v13, s0 @@ -3846,7 +3834,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v13, s5 ; GFX10-NEXT: s_mov_b64 s[0:1], 16 ; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[0:1] ; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -3884,20 +3872,19 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] ; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_v_s: @@ -3933,17 +3920,17 @@ ; GFX8-NEXT: v_and_b32_e32 v1, s13, v1 ; GFX8-NEXT: v_or_b32_e32 v11, v1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_mov_b32_e32 v9, v8 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -4015,11 +4002,10 @@ ; GFX10-NEXT: s_mov_b32 s8, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: s_lshl_b32 s7, s8, s7 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s6, 0 -; GFX10-NEXT: s_not_b32 s7, s7 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 -; GFX10-NEXT: v_mov_b32_e32 v14, 0 +; GFX10-NEXT: s_not_b32 s7, s7 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s6, 0 +; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 @@ -4040,7 +4026,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v13, s5 ; GFX10-NEXT: s_mov_b64 s[0:1], 16 ; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[0:1] ; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -4079,19 +4065,18 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] ; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_v_v: @@ -4127,16 +4112,16 @@ ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 ; GFX8-NEXT: v_or_b32_e32 v12, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_mov_b32_e32 v9, v8 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -4198,7 +4183,6 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX10-NEXT: s_mov_b32 s4, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v15, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 @@ -4212,7 +4196,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12 ; GFX10-NEXT: v_mov_b32_e32 v12, 0 -; GFX10-NEXT: v_mov_b32_e32 v13, 0 +; GFX10-NEXT: v_mov_b32_e32 v13, v12 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 @@ -4233,7 +4217,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v14, s5 ; GFX10-NEXT: s_mov_b64 s[0:1], 16 ; GFX10-NEXT: global_store_dwordx4 v[12:13], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v15, v[4:7], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] ; GFX10-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -8,19 +8,18 @@ ; GFX9-LABEL: insertelement_s_v2i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -28,19 +27,19 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: flat_load_ushort v1, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -72,16 +71,15 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 0 -; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 ; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr @@ -93,37 +91,37 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v2i8_s_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_load_ushort v1, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i8_s_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: flat_load_ushort v1, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -157,12 +155,12 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0 ; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1 )* %ptr @@ -175,38 +173,37 @@ ; GFX9-LABEL: insertelement_s_v2i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v1, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: global_store_short v[1:2], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v2i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: flat_load_ushort v1, v[1:2] +; GFX8-NEXT: flat_load_ushort v2, v[1:2] ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: flat_store_short v[1:2], v0 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v2i8_v_s: @@ -236,17 +233,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 ; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_and_b32_sdwa v2, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: global_store_short v[1:2], v0, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx @@ -258,40 +254,39 @@ ; GFX9-LABEL: insertelement_s_v2i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: global_load_ushort v2, v1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: global_store_short v[1:2], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v2i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: flat_load_ushort v1, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: flat_load_ushort v2, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: flat_store_short v[1:2], v0 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v2i8_s_v: @@ -322,17 +317,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, s4, vcc_lo +; GFX10-NEXT: v_and_b32_sdwa v2, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: global_store_short v[1:2], v0, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx @@ -344,38 +338,37 @@ ; GFX9-LABEL: insertelement_s_v2i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ushort v2, v2, s[2:3] +; GFX9-NEXT: global_load_ushort v3, v2, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: global_store_short v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v2i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: flat_load_ushort v2, v[2:3] +; GFX8-NEXT: flat_load_ushort v3, v[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: flat_store_short v[2:3], v0 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v2i8_v_v: @@ -405,17 +398,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: global_load_ushort v2, v2, s[2:3] +; GFX10-NEXT: global_load_ushort v3, v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_and_b32_sdwa v3, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: v_and_b32_sdwa v1, v4, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: global_store_short v[2:3], v0, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx @@ -426,37 +418,37 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v2i8_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_load_ushort v1, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i8_s_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: flat_load_ushort v1, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -491,11 +483,11 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr @@ -507,35 +499,35 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(<2 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v2i8_v_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: global_load_ushort v1, v[0:1], off ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i8_v_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -569,11 +561,11 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr @@ -585,35 +577,35 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(<2 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v2i8_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: global_load_ushort v1, v[0:1], off ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i8_v_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -647,11 +639,11 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr @@ -720,7 +712,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v4i8_s_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_and_b32 s3, s3, 3 @@ -730,33 +722,31 @@ ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 ; GFX9-NEXT: s_lshl_b32 s3, s4, s3 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 16 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v4 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX9-NEXT: v_or3_b32 v1, v1, v5, v3 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_or3_b32 v2, v1, v4, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i8_s_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: s_and_b32 s1, s3, 3 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 @@ -764,27 +754,27 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, 8 -; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, s2, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -832,32 +822,30 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_mov_b32 s1, 16 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: s_and_b32 s3, s3, 3 +; GFX10-NEXT: s_and_b32 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s3, s3, 3 +; GFX10-NEXT: s_lshl_b32 s5, s4, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, s3 +; GFX10-NEXT: s_not_b32 s3, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX10-NEXT: s_and_b32 s0, s3, 3 -; GFX10-NEXT: v_mov_b32_e32 v2, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, 3 ; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 -; GFX10-NEXT: s_lshl_b32 s3, s1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s2, s0 -; GFX10-NEXT: s_not_b32 s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, s0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, s3, s2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v1, v0, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX10-NEXT: v_or3_b32 v2, v1, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1 )* %ptr @@ -894,11 +882,11 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s5, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v2, v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v2, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -930,11 +918,11 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1002,11 +990,11 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s2, v1 +; GFX10-NEXT: v_and_or_b32 v1, v0, s2, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2 +; GFX10-NEXT: v_or3_b32 v2, v1, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr @@ -1043,11 +1031,11 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s5, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v2, v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v2, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1080,11 +1068,11 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1154,11 +1142,11 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v1, v0, s1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2 +; GFX10-NEXT: v_or3_b32 v2, v1, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr @@ -1194,11 +1182,11 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s4, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-NEXT: v_or3_b32 v2, v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v2, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1230,11 +1218,11 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1303,11 +1291,11 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v1, v0, s1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2 +; GFX10-NEXT: v_or3_b32 v2, v1, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr @@ -1319,7 +1307,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v4i8_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 @@ -1329,60 +1317,59 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v4, v2, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v0, v0, v7, v5 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v3, v2 +; GFX9-NEXT: v_or3_b32 v2, v1, v4, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i8_s_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, 16 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1432,8 +1419,10 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_and_b32 s2, s2, s1 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 @@ -1441,21 +1430,18 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: s_and_b32 s0, s2, s1 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX10-NEXT: v_or3_b32 v0, v0, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 8 ; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, s1, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v1, v0, s1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v2, v4, v3 +; GFX10-NEXT: v_or3_b32 v2, v1, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr @@ -1467,7 +1453,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v4i8_v_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_mov_b32 s1, 16 @@ -1476,60 +1462,58 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_lshl_b32 s2, s3, s2 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v4 -; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_or3_b32 v1, v1, v5, v3 +; GFX9-NEXT: v_and_or_b32 v1, v1, s2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v3, v2 +; GFX9-NEXT: v_or3_b32 v2, v1, v4, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i8_v_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: s_and_b32 s1, s2, 3 -; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 -; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1578,30 +1562,28 @@ ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 +; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: s_and_b32 s2, s2, 3 +; GFX10-NEXT: s_lshl_b32 s2, s2, 3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: s_lshl_b32 s1, s2, 3 -; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_lshl_b32 s1, s0, s1 -; GFX10-NEXT: s_not_b32 s1, s1 ; GFX10-NEXT: v_or3_b32 v0, v0, v4, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s0, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v1, v0, s3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX10-NEXT: v_or3_b32 v2, v1, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr @@ -1613,7 +1595,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v4i8_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX9-NEXT: s_mov_b32 s1, 16 @@ -1622,60 +1604,58 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s2 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_mov_b32_e32 v5, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_or3_b32 v0, v0, v8, v6 -; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v3, v0, v1, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s2, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX9-NEXT: v_or3_b32 v2, v1, v5, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i8_v_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_mov_b32_e32 v1, 0xff ; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX8-NEXT: v_mov_b32_e32 v6, 8 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1686,36 +1666,35 @@ ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_movk_i32 s2, 0xff -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NEXT: v_and_b32_e32 v1, 3, v3 ; GFX7-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v1 -; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v1 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v5, s2, v0 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s2, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1726,6 +1705,7 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 3, v3 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_movk_i32 s1, 0xff +; GFX10-NEXT: s_mov_b32 s2, 16 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v1, s1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1733,22 +1713,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: v_or3_b32 v0, v0, v6, v3 -; GFX10-NEXT: v_mov_b32_e32 v3, 8 ; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v2, v4, v3 +; GFX10-NEXT: v_or3_b32 v2, v1, v3, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr @@ -1763,8 +1740,8 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_mov_b32 s8, 0x80008 ; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s9, s0, s8 ; GFX9-NEXT: s_and_b32 s7, s0, s6 @@ -1821,9 +1798,9 @@ ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_lshl_b32 s2, s3, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v8i8_s_s: @@ -1831,8 +1808,8 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: s_mov_b32 s8, 0x80008 ; GFX8-NEXT: s_movk_i32 s6, 0xff -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s9, s0, s8 ; GFX8-NEXT: s_and_b32 s7, s0, s6 @@ -1889,9 +1866,9 @@ ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v8i8_s_s: @@ -1971,7 +1948,7 @@ ; GFX10-NEXT: s_mov_b32 s6, 0x80010 ; GFX10-NEXT: s_lshr_b32 s7, s5, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s11, s0, s3 ; GFX10-NEXT: s_bfe_u32 s13, s1, s3 @@ -2051,49 +2028,49 @@ ; GFX9-NEXT: s_lshl_b32 s3, s4, s3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_mov_b32_e32 v5, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v6, v7, s3, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX9-NEXT: v_or3_b32 v0, v0, v9, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v5, v4 +; GFX9-NEXT: v_or3_b32 v1, v1, v11, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v5, v6, s3, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_mov_b32_e32 v4, 16 ; GFX8-NEXT: s_lshr_b32 s1, s3, 2 ; GFX8-NEXT: s_and_b32 s3, s3, 3 ; GFX8-NEXT: s_movk_i32 s0, 0xff @@ -2103,45 +2080,45 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 -; GFX8-NEXT: v_mov_b32_e32 v6, 8 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_or_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -2253,12 +2230,12 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v2, v0, s4, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, s4, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v2, v7, v4 -; GFX10-NEXT: v_or3_b32 v3, v3, v8, v5 +; GFX10-NEXT: v_or3_b32 v3, v1, v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1 )* %ptr @@ -2321,12 +2298,12 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2 +; GFX9-NEXT: v_and_or_b32 v2, v1, s7, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -2387,10 +2364,10 @@ ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -2515,12 +2492,12 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v5 ; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 -; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 +; GFX10-NEXT: v_or3_b32 v3, v1, v7, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -2584,12 +2561,12 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2 +; GFX9-NEXT: v_and_or_b32 v2, v1, s7, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -2652,10 +2629,10 @@ ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -2783,12 +2760,12 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 -; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 +; GFX10-NEXT: v_or3_b32 v3, v1, v7, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -2851,12 +2828,12 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s6, v2 +; GFX9-NEXT: v_and_or_b32 v2, v1, s6, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_or3_b32 v1, v2, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -2918,10 +2895,10 @@ ; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v2 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -3048,12 +3025,12 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 -; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 +; GFX10-NEXT: v_or3_b32 v3, v1, v7, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -3069,104 +3046,104 @@ ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 ; GFX9-NEXT: s_movk_i32 s3, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v8, v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v7, v2, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_mov_b32_e32 v5, 16 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_or3_b32 v0, v0, v12, v9 -; GFX9-NEXT: v_or3_b32 v1, v1, v14, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_or3_b32 v0, v0, v11, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v13, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v8, v2, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, 8 -; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v10, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v9, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mov_b32_e32 v7, 8 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v8 +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v12 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -3278,12 +3255,12 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v8, v1, s3, v2 +; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v5, v7, v4 -; GFX10-NEXT: v_or3_b32 v3, v8, v3, v6 +; GFX10-NEXT: v_or3_b32 v3, v1, v3, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -3306,40 +3283,40 @@ ; GFX9-NEXT: s_lshl_b32 s2, s3, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_mov_b32_e32 v5, 16 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, s2, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v9, v6 +; GFX9-NEXT: v_or3_b32 v1, v1, v11, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v6, s2, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -3348,54 +3325,54 @@ ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_lshr_b32 s1, s2, 2 ; GFX8-NEXT: s_and_b32 s2, s2, 3 -; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: s_lshl_b32 s2, s2, 3 -; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: v_mov_b32_e32 v7, 8 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, v3 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -3506,12 +3483,12 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_and_or_b32 v2, v0, s3, v5 -; GFX10-NEXT: v_and_or_b32 v3, v1, s3, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v2, v7, v4 -; GFX10-NEXT: v_or3_b32 v3, v3, v8, v5 +; GFX10-NEXT: v_or3_b32 v3, v1, v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -3527,103 +3504,103 @@ ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 ; GFX9-NEXT: s_movk_i32 s2, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 2, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 2, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX9-NEXT: v_mov_b32_e32 v6, 0xff +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, 8 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 +; GFX9-NEXT: v_mov_b32_e32 v6, 8 +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v1, v1, s2, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_and_or_b32 v1, v1, s2, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v0, v13, v10 -; GFX9-NEXT: v_or3_b32 v1, v1, v15, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v10, v3, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 +; GFX9-NEXT: v_or3_b32 v0, v0, v12, v9 +; GFX9-NEXT: v_or3_b32 v1, v1, v14, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v9, v3, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, v6, v9 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v5, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v6, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, v5, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX9-NEXT: v_or3_b32 v0, v0, v10, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v8, v3 +; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v7, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 2, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 2, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX8-NEXT: v_mov_b32_e32 v6, 0xff -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff +; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v6 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 -; GFX8-NEXT: v_mov_b32_e32 v9, 8 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_mov_b32_e32 v9, 16 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v12 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, v6, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -3736,12 +3713,12 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v2, v0, v5, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, v5, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX10-NEXT: v_and_or_b32 v1, v1, v5, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v7 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v2, v8, v4 -; GFX10-NEXT: v_or3_b32 v3, v3, v9, v5 +; GFX10-NEXT: v_or3_b32 v3, v1, v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -3757,7 +3734,7 @@ ; GFX9-NEXT: s_mov_b32 s12, 0x80008 ; GFX9-NEXT: s_movk_i32 s10, 0xff ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s13, s0, s12 ; GFX9-NEXT: s_and_b32 s11, s0, s10 @@ -3875,7 +3852,7 @@ ; GFX8-NEXT: s_mov_b32 s12, 0x80008 ; GFX8-NEXT: s_movk_i32 s10, 0xff ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s13, s0, s12 ; GFX8-NEXT: s_and_b32 s11, s0, s10 @@ -4113,7 +4090,7 @@ ; GFX10-NEXT: s_movk_i32 s6, 0xff ; GFX10-NEXT: s_mov_b32 s8, 0x80010 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s14, s0, s7 ; GFX10-NEXT: s_lshr_b32 s9, s0, 24 @@ -4236,91 +4213,91 @@ ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 ; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v6, 8 -; GFX9-NEXT: v_mov_b32_e32 v7, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 ; GFX9-NEXT: s_lshr_b32 s4, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 +; GFX9-NEXT: v_mov_b32_e32 v6, 16 ; GFX9-NEXT: s_and_b32 s2, s2, s6 ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 ; GFX9-NEXT: s_lshl_b32 s3, s6, s3 ; GFX9-NEXT: s_not_b32 s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v8, s2 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v13 -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v0, v14, v9 -; GFX9-NEXT: v_or3_b32 v1, v1, v16, v10 -; GFX9-NEXT: v_and_or_b32 v13, v3, s6, v19 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX9-NEXT: v_or3_b32 v2, v2, v18, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_or3_b32 v3, v13, v3, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[2:3] -; GFX9-NEXT: v_and_or_b32 v8, v9, s5, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v11 ; GFX9-NEXT: v_or3_b32 v0, v0, v13, v8 ; GFX9-NEXT: v_or3_b32 v1, v1, v15, v9 +; GFX9-NEXT: v_and_or_b32 v12, v3, s6, v18 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_or3_b32 v2, v2, v17, v10 -; GFX9-NEXT: v_or3_b32 v3, v3, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX9-NEXT: v_or3_b32 v3, v12, v3, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v3, s[2:3] +; GFX9-NEXT: v_and_or_b32 v7, v8, s5, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v10 +; GFX9-NEXT: v_or3_b32 v0, v0, v12, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v14, v8 +; GFX9-NEXT: v_or3_b32 v2, v2, v16, v9 +; GFX9-NEXT: v_or3_b32 v3, v3, v6, v5 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; -; GFX8-LABEL: insertelement_v_v16i8_s_s: -; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v6, 8 -; GFX8-NEXT: v_mov_b32_e32 v7, 16 -; GFX8-NEXT: v_mov_b32_e32 v8, 8 -; GFX8-NEXT: v_mov_b32_e32 v9, 16 +; GFX8-LABEL: insertelement_v_v16i8_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 16 +; GFX8-NEXT: v_mov_b32_e32 v7, 8 ; GFX8-NEXT: s_and_b32 s1, s3, 3 ; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v8, 16 ; GFX8-NEXT: s_lshr_b32 s4, s3, 2 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 ; GFX8-NEXT: s_and_b32 s2, s2, s0 @@ -4331,74 +4308,74 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v11 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v15 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v11 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v19 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX8-NEXT: v_or_b32_e32 v3, v3, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 -; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v18 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX8-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX8-NEXT: v_or_b32_e32 v5, s5, v5 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v11 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v9 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v16 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -4518,13 +4495,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_mov_b32_e32 v5, 16 +; GFX10-NEXT: v_mov_b32_e32 v4, 8 ; GFX10-NEXT: s_lshr_b32 s5, s3, 2 -; GFX10-NEXT: s_and_b32 s2, s2, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, 16 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 +; GFX10-NEXT: s_and_b32 s2, s2, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 @@ -4533,13 +4510,13 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 @@ -4577,7 +4554,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v12 @@ -4587,11 +4564,11 @@ ; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v0, v0, v11, v6 ; GFX10-NEXT: v_or3_b32 v1, v1, v13, v7 ; GFX10-NEXT: v_or3_b32 v2, v2, v15, v8 -; GFX10-NEXT: v_or3_b32 v3, v3, v16, v9 +; GFX10-NEXT: v_or3_b32 v3, v3, v5, v9 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1 )* %ptr @@ -4687,23 +4664,23 @@ ; GFX9-NEXT: v_and_or_b32 v4, v1, s11, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v4, v2, s11, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX9-NEXT: v_or3_b32 v2, v4, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s11, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s11, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_mov_b32_e32 v5, 16 +; GFX9-NEXT: v_and_or_b32 v4, v3, s11, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -4796,24 +4773,24 @@ ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -4943,7 +4920,6 @@ ; GFX10-NEXT: s_movk_i32 s5, 0xff ; GFX10-NEXT: s_mov_b32 s7, 0x80010 ; GFX10-NEXT: v_and_b32_e32 v0, s5, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_bfe_u32 s13, s0, s6 @@ -5007,37 +4983,38 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 2 -; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_and_or_b32 v5, v0, s5, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v11, v2, s5, v11 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 -; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v0, v5, v0, v10 +; GFX10-NEXT: v_or3_b32 v1, v9, v1, v6 ; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8 +; GFX10-NEXT: v_and_or_b32 v13, v3, s5, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v3, v13, v3, v8 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr @@ -5134,23 +5111,23 @@ ; GFX9-NEXT: v_and_or_b32 v4, v1, s12, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v4, v2, s12, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX9-NEXT: v_or3_b32 v2, v4, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s12, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s12, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_mov_b32_e32 v5, 16 +; GFX9-NEXT: v_and_or_b32 v4, v3, s12, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -5245,24 +5222,24 @@ ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -5395,7 +5372,6 @@ ; GFX10-NEXT: s_mov_b32 s7, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 @@ -5463,31 +5439,32 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: s_mov_b32 s2, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, 8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v5, v0, s5, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_and_or_b32 v11, v2, s5, v11 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 +; GFX10-NEXT: v_and_or_b32 v13, v3, s5, v4 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 -; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v0, v5, v0, v10 +; GFX10-NEXT: v_or3_b32 v1, v9, v1, v6 ; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8 +; GFX10-NEXT: v_or3_b32 v3, v13, v3, v8 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr @@ -5583,23 +5560,23 @@ ; GFX9-NEXT: v_and_or_b32 v4, v1, s10, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v4, v2, s10, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX9-NEXT: v_or3_b32 v2, v4, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s10, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s10, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_mov_b32_e32 v5, 16 +; GFX9-NEXT: v_and_or_b32 v4, v3, s10, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX9-NEXT: v_or3_b32 v3, v4, v3, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -5693,24 +5670,24 @@ ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -5843,7 +5820,6 @@ ; GFX10-NEXT: s_mov_b32 s9, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 @@ -5910,31 +5886,32 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: s_mov_b32 s2, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v0, s8, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, 8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v5, v0, s8, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v9, v1, s8, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_and_or_b32 v11, v2, s8, v11 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s8, v10 +; GFX10-NEXT: v_and_or_b32 v13, v3, s8, v4 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 -; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or3_b32 v0, v5, v0, v10 +; GFX10-NEXT: v_or3_b32 v1, v9, v1, v6 ; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8 +; GFX10-NEXT: v_or3_b32 v3, v13, v3, v8 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr @@ -5953,75 +5930,75 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v12 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 ; GFX9-NEXT: s_and_b32 s0, s2, s6 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9 -; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: v_and_or_b32 v13, v6, s6, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX9-NEXT: v_or3_b32 v4, v4, v15, v9 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX9-NEXT: v_and_or_b32 v12, v6, s6, v18 ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e64 v17, v2, s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15 +; GFX9-NEXT: v_lshlrev_b32_e64 v16, v2, s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_or3_b32 v5, v5, v17, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v14 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6 -; GFX9-NEXT: v_or3_b32 v6, v13, v6, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15 +; GFX9-NEXT: v_or3_b32 v6, v12, v6, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v14 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v2, v8, v2, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3 -; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6 -; GFX9-NEXT: v_or3_b32 v2, v5, v17, v9 -; GFX9-NEXT: v_or3_b32 v3, v10, v18, v11 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX9-NEXT: v_and_or_b32 v9, v2, s6, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_or3_b32 v0, v1, v12, v3 +; GFX9-NEXT: v_or3_b32 v1, v4, v14, v6 +; GFX9-NEXT: v_or3_b32 v2, v5, v16, v8 +; GFX9-NEXT: v_or3_b32 v3, v9, v17, v10 +; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -6030,88 +6007,88 @@ ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v9, 8 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 ; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v9, 16 ; GFX8-NEXT: s_and_b32 s1, s2, s0 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 2, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v6 -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v10 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v11 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX8-NEXT: v_lshlrev_b32_e64 v16, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v18 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v5, v6 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX8-NEXT: v_lshlrev_b32_e64 v17, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v4, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v14 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v0, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v14 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[2:3] ; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[2:3] +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 -; GFX8-NEXT: v_or_b32_e32 v11, v0, v15 -; GFX8-NEXT: v_or_b32_e32 v12, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v10, v2, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v11, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v12, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v10 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v12 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v14 +; GFX8-NEXT: v_or_b32_e32 v10, v0, v16 +; GFX8-NEXT: v_or_b32_e32 v9, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v10, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v8 +; GFX8-NEXT: v_mov_b32_e32 v8, v7 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -6231,12 +6208,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_and_b32_e32 v0, 3, v2 -; GFX10-NEXT: v_mov_b32_e32 v7, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; GFX10-NEXT: v_mov_b32_e32 v7, 16 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6247,13 +6224,13 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v17, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 @@ -6292,19 +6269,19 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v2, v2, s3, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v12, v4, s3, v14 +; GFX10-NEXT: v_and_or_b32 v10, v4, s3, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v14, v0, s3, v1 +; GFX10-NEXT: v_and_or_b32 v12, v0, s3, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_or3_b32 v0, v2, v11, v10 +; GFX10-NEXT: v_or3_b32 v0, v2, v11, v5 ; GFX10-NEXT: v_or3_b32 v1, v3, v13, v6 -; GFX10-NEXT: v_or3_b32 v2, v12, v15, v8 -; GFX10-NEXT: v_or3_b32 v3, v14, v7, v9 +; GFX10-NEXT: v_or3_b32 v2, v10, v15, v8 +; GFX10-NEXT: v_or3_b32 v3, v12, v7, v9 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr @@ -6321,9 +6298,9 @@ ; GFX9-NEXT: s_mov_b32 s1, 16 ; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-NEXT: s_lshr_b32 s4, s2, 2 ; GFX9-NEXT: s_and_b32 s2, s2, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -6331,66 +6308,66 @@ ; GFX9-NEXT: s_not_b32 s5, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v12 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9 -; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX9-NEXT: v_and_or_b32 v13, v6, s6, v19 +; GFX9-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX9-NEXT: v_or3_b32 v4, v4, v15, v9 +; GFX9-NEXT: v_and_or_b32 v12, v6, s6, v18 ; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_or3_b32 v5, v5, v17, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_or3_b32 v6, v13, v6, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX9-NEXT: v_and_or_b32 v2, v9, s5, v2 +; GFX9-NEXT: v_or3_b32 v6, v12, v6, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v2, v8, s5, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v11 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3 -; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6 -; GFX9-NEXT: v_or3_b32 v2, v5, v17, v9 -; GFX9-NEXT: v_or3_b32 v3, v10, v18, v11 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX9-NEXT: v_and_or_b32 v9, v2, s6, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_or3_b32 v0, v1, v12, v3 +; GFX9-NEXT: v_or3_b32 v1, v4, v14, v6 +; GFX9-NEXT: v_or3_b32 v2, v5, v16, v8 +; GFX9-NEXT: v_or3_b32 v3, v9, v17, v10 +; GFX9-NEXT: v_mov_b32_e32 v8, v7 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -6401,11 +6378,11 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v9, 8 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v10, s1 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v9, 16 ; GFX8-NEXT: s_lshr_b32 s4, s2, 2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 @@ -6413,74 +6390,74 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v6 -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v10 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v11 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v18 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v5, v6 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v4, vcc ; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc -; GFX8-NEXT: v_or_b32_e32 v4, v4, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[2:3] ; GFX8-NEXT: v_and_b32_e32 v5, s5, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[2:3] +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 -; GFX8-NEXT: v_or_b32_e32 v11, v0, v15 -; GFX8-NEXT: v_or_b32_e32 v12, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v10, v2, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v11, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v12, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v10 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v12 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v14 +; GFX8-NEXT: v_or_b32_e32 v10, v0, v16 +; GFX8-NEXT: v_or_b32_e32 v9, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v10, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v8 +; GFX8-NEXT: v_mov_b32_e32 v8, v7 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -6600,11 +6577,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: v_mov_b32_e32 v1, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: s_lshr_b32 s4, s2, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, 16 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3 @@ -6614,13 +6591,13 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v5 ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 @@ -6663,16 +6640,16 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX10-NEXT: v_and_or_b32 v6, v4, s3, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v5, s3, v14 +; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v12, v2, s3, v0 +; GFX10-NEXT: v_and_or_b32 v10, v2, s3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v0, v1, v11, v3 ; GFX10-NEXT: v_or3_b32 v1, v6, v13, v7 -; GFX10-NEXT: v_or3_b32 v2, v10, v15, v8 -; GFX10-NEXT: v_or3_b32 v3, v12, v16, v9 +; GFX10-NEXT: v_or3_b32 v2, v5, v15, v8 +; GFX10-NEXT: v_or3_b32 v3, v10, v16, v9 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr @@ -6686,9 +6663,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 ; GFX9-NEXT: s_movk_i32 s2, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NEXT: v_mov_b32_e32 v8, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6696,100 +6673,99 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, s1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v13, v4, s2, v13 -; GFX9-NEXT: v_and_or_b32 v15, v5, s2, v15 -; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v17 -; GFX9-NEXT: v_and_or_b32 v17, v7, v0, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 2, v3 +; GFX9-NEXT: v_and_or_b32 v5, v5, s2, v15 +; GFX9-NEXT: v_and_or_b32 v6, v6, s2, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 2, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_or3_b32 v9, v13, v14, v9 -; GFX9-NEXT: v_or3_b32 v10, v15, v16, v10 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 +; GFX9-NEXT: v_or3_b32 v5, v5, v16, v10 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX9-NEXT: v_and_or_b32 v15, v7, v0, v19 ; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX9-NEXT: v_or3_b32 v6, v6, v18, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v9, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v9, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v17 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v0 -; GFX9-NEXT: v_or3_b32 v7, v17, v7, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v19 +; GFX9-NEXT: v_or3_b32 v7, v15, v7, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v17 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[2:3] -; GFX9-NEXT: v_and_or_b32 v2, v11, v3, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v7, s[2:3] +; GFX9-NEXT: v_and_or_b32 v2, v10, v3, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v13 +; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v9, v9, v0, v15 +; GFX9-NEXT: v_and_or_b32 v5, v5, v0, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v17 +; GFX9-NEXT: v_and_or_b32 v12, v2, v0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_and_or_b32 v13, v2, v0, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_or3_b32 v0, v3, v14, v7 -; GFX9-NEXT: v_or3_b32 v1, v9, v16, v10 -; GFX9-NEXT: v_or3_b32 v2, v6, v18, v11 -; GFX9-NEXT: v_or3_b32 v3, v13, v8, v12 +; GFX9-NEXT: v_or3_b32 v0, v3, v13, v7 +; GFX9-NEXT: v_or3_b32 v1, v5, v15, v9 +; GFX9-NEXT: v_or3_b32 v2, v6, v17, v10 +; GFX9-NEXT: v_or3_b32 v3, v12, v8, v11 +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v9, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v9, 8 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5 ; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v17, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v19, 2, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 2, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v7 ; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_or_b32_sdwa v15, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v11 @@ -6797,58 +6773,59 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v14 ; GFX8-NEXT: v_or_b32_e32 v14, v15, v16 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v18 +; GFX8-NEXT: v_or_b32_sdwa v6, v7, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v18 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX8-NEXT: v_or_b32_e32 v3, v14, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX8-NEXT: v_or_b32_e32 v7, v17, v7 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 -; GFX8-NEXT: v_or_b32_e32 v7, v7, v13 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v19 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v11 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v17 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v17 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v0, v7, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v0, s[2:3] -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[2:3] +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v13 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX8-NEXT: v_or_b32_e32 v9, v1, v16 ; GFX8-NEXT: v_or_b32_e32 v10, v0, v10 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v8 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v3, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v9, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v10, v8 +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -6969,13 +6946,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v8, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_movk_i32 s2, 0xff ; GFX10-NEXT: v_and_b32_e32 v0, 3, v3 +; GFX10-NEXT: v_mov_b32_e32 v8, 8 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff ; GFX10-NEXT: v_mov_b32_e32 v9, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6986,15 +6963,15 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v17, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v18, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v4, v4, s2, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_and_or_b32 v5, v5, s2, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v19, s1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v20, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v6, v1, v18 +; GFX10-NEXT: v_and_or_b32 v6, v6, s2, v18 ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX10-NEXT: v_or3_b32 v4, v4, v15, v10 ; GFX10-NEXT: v_or3_b32 v5, v5, v17, v11 @@ -7030,19 +7007,19 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_and_or_b32 v3, v3, v1, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v13, v4, v1, v15 +; GFX10-NEXT: v_and_or_b32 v11, v4, v1, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_and_or_b32 v8, v0, v1, v8 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_or3_b32 v0, v2, v12, v11 +; GFX10-NEXT: v_or3_b32 v0, v2, v12, v5 ; GFX10-NEXT: v_or3_b32 v1, v3, v14, v6 -; GFX10-NEXT: v_or3_b32 v2, v13, v16, v7 +; GFX10-NEXT: v_or3_b32 v2, v11, v16, v7 ; GFX10-NEXT: v_or3_b32 v3, v8, v9, v10 +; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -984,15 +984,15 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 @@ -1006,15 +1006,15 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1061,32 +1061,32 @@ ; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: @@ -1543,15 +1543,15 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 @@ -1565,15 +1565,15 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1620,32 +1620,32 @@ ; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64: @@ -1681,8 +1681,8 @@ ; CI-LABEL: atomic_dec_shl_base_lds_0_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -1699,8 +1699,8 @@ ; VI-LABEL: atomic_dec_shl_base_lds_0_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -674,10 +674,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_inc_rtn_u64 v[2:3], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: lds_atomic_inc_ret_i64: @@ -688,10 +687,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: ds_inc_rtn_u64 v[2:3], v2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -739,10 +737,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_inc_rtn_u64 v[2:3], v2, v[0:1] offset:32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: lds_atomic_inc_ret_i64_offset: @@ -753,10 +750,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: ds_inc_rtn_u64 v[2:3], v2, v[0:1] offset:32 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -893,11 +889,10 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v1, v[0:1], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_ret_i64: @@ -905,11 +900,10 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc +; GFX10-NEXT: global_atomic_inc_x2 v[2:3], v1, v[0:1], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -956,11 +950,10 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v1, v[0:1], s[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset: @@ -968,11 +961,10 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: global_atomic_inc_x2 v[2:3], v1, v[0:1], s[2:3] offset:32 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -1008,9 +1000,8 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v1, v[0:1], s[0:1] glc ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i64: @@ -1018,9 +1009,8 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] glc +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v1, v[0:1], s[0:1] glc ; GFX10-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void @@ -1058,9 +1048,8 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v1, v[0:1], s[0:1] offset:32 glc ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset: @@ -1068,9 +1057,8 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v1, v[0:1], s[0:1] offset:32 glc ; GFX10-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) @@ -1082,15 +1070,15 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 @@ -1104,15 +1092,15 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1125,25 +1113,25 @@ ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 42 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:40 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -1158,53 +1146,53 @@ ; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[0:1] offset:40 glc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:40 glc ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 42 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[0:1] offset:40 glc +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:40 glc ; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -1527,8 +1515,8 @@ ; CI-LABEL: atomic_inc_shl_base_lds_0_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -1545,8 +1533,8 @@ ; VI-LABEL: atomic_inc_shl_base_lds_0_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 @@ -1562,30 +1550,28 @@ ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, 9 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX9-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v1, 9 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ds_inc_rtn_u64 v[3:4], v3, v[1:2] offset:16 ; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[2:3] -; GFX9-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[3:4], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 9 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 -; GFX10-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: ds_inc_rtn_u64 v[3:4], v3, v[1:2] offset:16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dword v3, v0, s[2:3] -; GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] +; GFX10-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[3:4], s[0:1] ; GFX10-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 @@ -1763,15 +1749,15 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 @@ -1785,15 +1771,15 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1806,18 +1792,18 @@ ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:40 glc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1830,12 +1816,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 40 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 @@ -1857,32 +1843,32 @@ ; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_endpgm ; @@ -1890,14 +1876,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:40 glc ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: @@ -1907,12 +1893,12 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 40 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -19,11 +19,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_mov_b64 s[4:5], 0x1000 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 -; GCN-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GCN-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GCN-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -49,11 +46,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_mov_b64 s[4:5], 0x1000 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 -; GCN-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; GCN-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GCN-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll @@ -66,9 +66,8 @@ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll @@ -546,9 +546,8 @@ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) @@ -604,9 +603,8 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %10:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %10:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -772,22 +770,21 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %14:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %14:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY6]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -74,9 +74,8 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -236,9 +235,8 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -271,9 +269,8 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -306,9 +303,8 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -342,9 +338,8 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -378,12 +373,10 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: %11:vgpr_32, dead %24:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; UNPACKED: %11:vgpr_32, dead %24:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 @@ -398,9 +391,8 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; PACKED: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -424,28 +416,27 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; UNPACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; UNPACKED: %13:vgpr_32, dead %49:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: %13:vgpr_32, dead %49:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY4]], implicit $exec - ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY5]], implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec + ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; UNPACKED: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; UNPACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; UNPACKED: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; UNPACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; UNPACKED: bb.2: ; UNPACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec - ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY11]], implicit $exec - ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec - ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec + ; UNPACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec + ; UNPACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec ; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec + ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -471,22 +462,21 @@ ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6 ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED: %13:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; PACKED: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; PACKED: %13:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; PACKED: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; PACKED: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; PACKED: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; PACKED: bb.2: ; PACKED: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec - ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; PACKED: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; PACKED: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY10]], implicit $exec + ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -241,9 +241,8 @@ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %13:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %13:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -270,22 +269,21 @@ ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %15:vgpr_32, dead %35:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec - ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %15:vgpr_32, dead %35:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY8]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY11]], implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -515,9 +515,8 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -573,9 +572,8 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -669,9 +667,8 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 @@ -693,22 +690,21 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: %14:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: %14:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec @@ -738,25 +734,24 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY6]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll @@ -52,9 +52,8 @@ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll @@ -45,9 +45,8 @@ ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: TBUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll @@ -362,9 +362,8 @@ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 94, i32 0) ret void @@ -420,9 +419,8 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset = add i32 %voffset.base, 4096 @@ -588,22 +586,21 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; CHECK: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY8]], implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec + ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -2060,8 +2060,10 @@ ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX6: %9:vgpr_32, dead %14:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 @@ -2073,8 +2075,10 @@ ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX7: %9:vgpr_32, dead %14:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 @@ -2203,9 +2207,11 @@ ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4068, implicit $exec + ; GFX6: %9:vgpr_32, dead %24:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2233,9 +2239,11 @@ ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4068, implicit $exec + ; GFX7: %9:vgpr_32, dead %24:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2449,11 +2457,13 @@ ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4036, implicit $exec + ; GFX6: %9:vgpr_32, dead %34:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2497,11 +2507,13 @@ ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4036, implicit $exec + ; GFX7: %9:vgpr_32, dead %34:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -3644,7 +3656,9 @@ ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5000, implicit $exec + ; GFX6: %9:vgpr_32, dead %40:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; GFX6: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec @@ -3661,8 +3675,8 @@ ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6: bb.3: @@ -3695,7 +3709,9 @@ ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000 + ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5000, implicit $exec + ; GFX7: %9:vgpr_32, dead %40:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; GFX7: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec @@ -3712,8 +3728,8 @@ ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7: bb.3: @@ -3803,7 +3819,9 @@ ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4076, implicit $exec + ; GFX6: %9:vgpr_32, dead %40:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; GFX6: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec @@ -3820,8 +3838,8 @@ ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6: bb.3: @@ -3854,7 +3872,9 @@ ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076 + ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4076, implicit $exec + ; GFX7: %9:vgpr_32, dead %40:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; GFX7: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec @@ -3871,8 +3891,8 @@ ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7: bb.3: @@ -3962,7 +3982,9 @@ ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4080, implicit $exec + ; GFX6: %9:vgpr_32, dead %40:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX6: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; GFX6: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec @@ -3979,8 +4001,8 @@ ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX6: bb.3: @@ -4013,7 +4035,9 @@ ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080 + ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4080, implicit $exec + ; GFX7: %9:vgpr_32, dead %40:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; GFX7: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 ; GFX7: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec @@ -4030,8 +4054,8 @@ ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN %9, [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s128), align 4) ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GFX7: bb.3: @@ -4530,9 +4554,8 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 - ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX6: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1024, implicit $exec + ; GFX6: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4546,9 +4569,8 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 - ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX7: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1024, implicit $exec + ; GFX7: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -4562,9 +4584,8 @@ ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 - ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX8: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec + ; GFX8: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1024, implicit $exec + ; GFX8: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -24,12 +24,14 @@ ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll @@ -58,9 +58,8 @@ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll @@ -113,9 +113,8 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll @@ -50,9 +50,8 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; UNPACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_]], [[COPY]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 4) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -74,9 +74,8 @@ ; UNPACKED: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec ; UNPACKED: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec - ; UNPACKED: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 - ; UNPACKED: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; UNPACKED: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_]], [[V_AND_B32_e64_1]], implicit $exec ; UNPACKED: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec ; UNPACKED: $vgpr0 = COPY [[V_OR_B32_e64_]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -160,9 +159,8 @@ ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -176,9 +174,8 @@ ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; UNPACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll @@ -108,9 +108,8 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[COPY4]], %subreg.sub1 ; CHECK: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -895,11 +895,11 @@ ; SI-NEXT: s_movk_i32 s2, 0x3c00 ; SI-NEXT: s_bfe_u32 s3, 0, 0x100000 ; SI-NEXT: s_bfe_u32 s2, s2, 0x100000 -; SI-NEXT: s_lshl_b32 s4, s3, 16 -; SI-NEXT: s_or_b32 s6, s2, s4 +; SI-NEXT: s_lshl_b32 s5, s3, 16 +; SI-NEXT: s_or_b32 s6, s2, s5 ; SI-NEXT: s_lshl_b32 s2, s2, 16 -; SI-NEXT: s_or_b32 s7, s3, s2 ; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_or_b32 s7, s3, s2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -965,7 +965,6 @@ ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -974,10 +973,11 @@ ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 -; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_branch .LBB7_5 @@ -1031,19 +1031,19 @@ ; GFX10-32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-32-NEXT: s_mov_b32 s1, 0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-32-NEXT: s_cbranch_execz .LBB7_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 -; GFX10-32-NEXT: s_wqm_b32 s3, s0 -; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: s_wqm_b32 s2, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_mov_b32 s1, 0 ; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-32-NEXT: s_branch .LBB7_5 ; GFX10-32-NEXT: .LBB7_4: ; %.continue1 @@ -1094,7 +1094,6 @@ ; GFX10-64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-64-NEXT: s_mov_b32 s4, 0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1103,11 +1102,12 @@ ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 -; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] -; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-64-NEXT: s_mov_b32 s2, 0 +; GFX10-64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-64-NEXT: s_branch .LBB7_5 ; GFX10-64-NEXT: .LBB7_4: ; %.continue1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -30,8 +30,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xff +; GFX10-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_lshrrev_b16 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i8 %value, %amount Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -97,8 +97,8 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -111,8 +111,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -152,8 +152,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4095(i32 addrspace(1)* %ptr) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4095: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc @@ -162,8 +162,8 @@ ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_offset4095: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc @@ -177,21 +177,23 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967296(i32 addrspace(1)* %ptr) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4294967296: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s0, 0 -; GFX6-NEXT: s_mov_b32 s1, 4 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 4, v1, vcc +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, s0 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_offset4294967296: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s0, 0 -; GFX7-NEXT: s_mov_b32 s1, 4 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 4, v1, vcc +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4294967296 @@ -202,21 +204,23 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(i32 addrspace(1)* %ptr) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4294967297: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s0, 4 -; GFX6-NEXT: s_mov_b32 s1, s0 -; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 4, v1, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_offset4294967297: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s0, 4 -; GFX7-NEXT: s_mov_b32 s1, s0 -; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 4, v1, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4294967297 @@ -227,8 +231,8 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4096(i32 addrspace(1)* %ptr) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_offset4096: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: s_movk_i32 s4, 0x4000 @@ -237,8 +241,8 @@ ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_offset4096: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_movk_i32 s4, 0x4000 @@ -257,8 +261,8 @@ ; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -271,8 +275,8 @@ ; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 ; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -287,8 +291,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -297,8 +301,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm @@ -312,8 +316,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024 ; GFX6-NEXT: s_endpgm @@ -322,8 +326,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024 ; GFX7-NEXT: s_endpgm @@ -338,8 +342,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024 ; GFX6-NEXT: s_endpgm @@ -348,8 +352,8 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024 ; GFX7-NEXT: s_endpgm @@ -366,8 +370,8 @@ ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -378,8 +382,8 @@ ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm @@ -395,8 +399,8 @@ ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64 @@ -408,8 +412,8 @@ ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64 @@ -426,8 +430,8 @@ ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64 @@ -439,8 +443,8 @@ ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64 @@ -623,20 +627,22 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(float addrspace(1)* %ptr) { ; GFX6-LABEL: mubuf_load_vgpr_ptr_offset4294967296: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s0, 0 -; GFX6-NEXT: s_mov_b32 s1, 4 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 4, v1, vcc +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, s0 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_offset4294967296: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s0, 0 -; GFX7-NEXT: s_mov_b32 s1, 4 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 4, v1, vcc +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog @@ -648,20 +654,22 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(float addrspace(1)* %ptr) { ; GFX6-LABEL: mubuf_load_vgpr_ptr_offset4294967297: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s0, 4 -; GFX6-NEXT: s_mov_b32 s1, s0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 4, v1, vcc ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_offset4294967297: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s0, 4 -; GFX7-NEXT: s_mov_b32 s1, s0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 4, v1, vcc ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog @@ -1011,31 +1019,31 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(i32 addrspace(1)* %ptr) { ; GFX6-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4294967296: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s0, 0 -; GFX6-NEXT: s_mov_b32 s1, 4 -; GFX6-NEXT: v_mov_b32_e32 v2, 2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v0 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 4, v1, vcc +; GFX6-NEXT: v_mov_b32_e32 v0, 2 +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, s0 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_add v0, v[2:3], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4294967296: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s0, 0 -; GFX7-NEXT: s_mov_b32 s1, 4 -; GFX7-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0, v0 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 4, v1, vcc +; GFX7-NEXT: v_mov_b32_e32 v0, 2 +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_add v0, v[2:3], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4294967296 %result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst @@ -1203,11 +1211,12 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(i32 addrspace(1)* %ptr, i32 %old, i32 %in) { ; GFX6-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4294967296: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s0, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0, v0 ; GFX6-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-NEXT: s_mov_b32 s1, 4 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 4, v1, vcc +; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, s0 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -1218,11 +1227,12 @@ ; ; GFX7-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4294967296: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s0, 0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0, v0 ; GFX7-NEXT: v_mov_b32_e32 v4, v2 -; GFX7-NEXT: s_mov_b32 s1, 4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 4, v1, vcc +; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, s0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.div.fmas.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.div.fmas.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.div.fmas.mir @@ -43,13 +43,12 @@ ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[COPY4]] - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[ICMP]](s1) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[ICMP]](s1) %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = COPY $sgpr2 @@ -71,12 +70,11 @@ ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[COPY4]] - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY]](s32), [[COPY5]](s32), [[COPY6]](s32), [[ICMP]](s1) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY]](s32), [[COPY4]](s32), [[COPY5]](s32), [[ICMP]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $sgpr0 %2:_(s32) = COPY $sgpr1 @@ -98,9 +96,8 @@ ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[COPY4]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]] ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[ICMP]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.kill.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.kill.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.kill.mir @@ -47,9 +47,8 @@ body: | bb.0: ; CHECK-LABEL: name: kill_constant_true - ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true - ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.kill), [[COPY]](s1) + ; CHECK: [[C:%[0-9]+]]:vcc(s1) = G_CONSTANT i1 true + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.kill), [[C]](s1) %0:_(s1) = G_CONSTANT i1 true G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.kill), %0 ... @@ -61,9 +60,8 @@ body: | bb.0: ; CHECK-LABEL: name: kill_constant_false - ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 false - ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.kill), [[COPY]](s1) + ; CHECK: [[C:%[0-9]+]]:vcc(s1) = G_CONSTANT i1 false + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.kill), [[C]](s1) %0:_(s1) = G_CONSTANT i1 false G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.kill), %0 ... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -15,8 +15,8 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -36,9 +36,9 @@ ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY7]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY6]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -59,7 +59,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec @@ -77,7 +77,7 @@ ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: @@ -104,7 +104,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: @@ -113,7 +113,7 @@ ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY5]](s32), implicit $exec ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: @@ -140,7 +140,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec @@ -161,7 +161,7 @@ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -1132,11 +1132,11 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4092 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092 @@ -1149,11 +1149,11 @@ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4092 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4092, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 @@ -1172,11 +1172,11 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4095 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095 @@ -1189,11 +1189,11 @@ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4095 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 @@ -1212,10 +1212,11 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4096 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 @@ -1228,10 +1229,11 @@ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4096 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 @@ -1251,12 +1253,12 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4064 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK: $vgpr0 = COPY [[UV]](s32) @@ -1278,12 +1280,12 @@ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4064 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY: $vgpr0 = COPY [[UV]](s32) @@ -1312,11 +1314,12 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4068 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; CHECK: $vgpr0 = COPY [[UV]](s32) @@ -1338,11 +1341,12 @@ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4068 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>) ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>) ; GREEDY: $vgpr0 = COPY [[UV]](s32) @@ -1370,14 +1374,14 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4032 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; CHECK: $vgpr0 = COPY [[UV]](s32) @@ -1407,14 +1411,14 @@ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4032 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[COPY4]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; GREEDY: $vgpr0 = COPY [[UV]](s32) @@ -1450,13 +1454,14 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4036 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; CHECK: $vgpr0 = COPY [[UV]](s32) @@ -1486,13 +1491,14 @@ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4036 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4) ; GREEDY: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>) ; GREEDY: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>) ; GREEDY: $vgpr0 = COPY [[UV]](s32) @@ -2250,15 +2256,16 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5000 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2270,8 +2277,8 @@ ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: @@ -2298,15 +2305,16 @@ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 5000 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GREEDY: bb.2: ; GREEDY: successors: %bb.3, %bb.2 - ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2318,8 +2326,8 @@ ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY: bb.3: @@ -2352,15 +2360,16 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4076 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2372,8 +2381,8 @@ ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: @@ -2400,15 +2409,16 @@ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4076 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GREEDY: bb.2: ; GREEDY: successors: %bb.3, %bb.2 - ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2420,8 +2430,8 @@ ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY: bb.3: @@ -2454,15 +2464,16 @@ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4080 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3, %bb.2 - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2474,8 +2485,8 @@ ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; CHECK: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.2, implicit $exec ; CHECK: bb.3: @@ -2502,15 +2513,16 @@ ; GREEDY: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080 - ; GREEDY: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]] - ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4080 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; GREEDY: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; GREEDY: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>) ; GREEDY: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; GREEDY: bb.2: ; GREEDY: successors: %bb.3, %bb.2 - ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.2 + ; GREEDY: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.2 ; GREEDY: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; GREEDY: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; GREEDY: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -2522,8 +2534,8 @@ ; GREEDY: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GREEDY: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; GREEDY: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GREEDY: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4) ; GREEDY: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GREEDY: SI_WATERFALL_LOOP %bb.2, implicit $exec ; GREEDY: bb.3: @@ -2734,11 +2746,11 @@ ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024 + ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm @@ -2754,11 +2766,11 @@ ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] - ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024 + ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, %offset.s @@ -2781,11 +2793,11 @@ ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024 + ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm @@ -2801,11 +2813,11 @@ ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] - ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024 + ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, %offset.v @@ -2870,12 +2882,12 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] - ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) - ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] - ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024 + ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; CHECK: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY6]] + ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr @@ -2889,12 +2901,12 @@ ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] - ; GREEDY: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) - ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] - ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1024 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[C1]] + ; GREEDY: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GREEDY: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY6]] + ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, 1024 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.mir @@ -60,22 +60,22 @@ ; FAST: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; FAST: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -60 - ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] - ; FAST: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; FAST: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; FAST: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; FAST: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -60 + ; FAST: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C1]] + ; FAST: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; FAST: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; FAST: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-LABEL: name: s_buffer_load_negative_offset ; GREEDY: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0 ; GREEDY: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -60 - ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] - ; GREEDY: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -60 + ; GREEDY: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C1]] + ; GREEDY: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[COPY]](<4 x s32>), [[C3]](s32), [[ADD]], [[C2]], 0, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY: S_ENDPGM 0, implicit [[AMDGPU_BUFFER_LOAD]](s32) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir @@ -47,9 +47,8 @@ body: | bb.0: ; CHECK-LABEL: name: wqm_demote_constant_true - ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true - ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1) + ; CHECK: [[C:%[0-9]+]]:vcc(s1) = G_CONSTANT i1 true + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[C]](s1) %0:_(s1) = G_CONSTANT i1 true G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 ... @@ -61,9 +60,8 @@ body: | bb.0: ; CHECK-LABEL: name: wqm_demote_constant_false - ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 false - ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1) + ; CHECK: [[C:%[0-9]+]]:vcc(s1) = G_CONSTANT i1 false + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[C]](s1) %0:_(s1) = G_CONSTANT i1 false G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 ... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir @@ -10,9 +10,8 @@ liveins: $vgpr0_vgpr1 ; CHECK-LABEL: name: test_constant_s32_vgpr_use ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: G_STORE [[COPY1]](s32), [[COPY]](p1) :: (store (s32)) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK: G_STORE [[C]](s32), [[COPY]](p1) :: (store (s32)) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 1 G_STORE %1, %0 :: (store (s32)) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir @@ -554,9 +554,8 @@ ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 - ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -603,15 +602,14 @@ ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15 ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]] ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]] - ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) - ; WAVE64: $vgpr0 = COPY [[COPY3]](s32) + ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) + ; WAVE64: $vgpr0 = COPY [[COPY2]](s32) ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_add1 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 - ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -658,8 +656,8 @@ ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15 ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]] ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]] - ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) - ; WAVE32: $vgpr0 = COPY [[COPY3]](s32) + ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) + ; WAVE32: $vgpr0 = COPY [[COPY2]](s32) %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s32) = COPY $vgpr16 %2:_(s32) = G_CONSTANT i32 1 @@ -681,9 +679,8 @@ ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 - ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 - ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1 + ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -730,15 +727,14 @@ ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15 ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]] ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]] - ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) - ; WAVE64: $vgpr0 = COPY [[COPY3]](s32) + ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) + ; WAVE64: $vgpr0 = COPY [[COPY2]](s32) ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_addm1 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 - ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 - ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 -1 + ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -785,8 +781,8 @@ ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15 ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]] ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]] - ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) - ; WAVE32: $vgpr0 = COPY [[COPY3]](s32) + ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) + ; WAVE32: $vgpr0 = COPY [[COPY2]](s32) %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s32) = COPY $vgpr16 %2:_(s32) = G_CONSTANT i32 -1 @@ -808,9 +804,8 @@ ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 - ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16 + ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -857,15 +852,14 @@ ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15 ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]] ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]] - ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) - ; WAVE64: $vgpr0 = COPY [[COPY3]](s32) + ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) + ; WAVE64: $vgpr0 = COPY [[COPY2]](s32) ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_add16 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 - ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16 + ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -912,8 +906,8 @@ ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15 ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]] ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]] - ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) - ; WAVE32: $vgpr0 = COPY [[COPY3]](s32) + ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) + ; WAVE32: $vgpr0 = COPY [[COPY2]](s32) %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s32) = COPY $vgpr16 %2:_(s32) = G_CONSTANT i32 16 @@ -935,9 +929,8 @@ ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 - ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>) ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -967,17 +960,16 @@ ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]] ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]] ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]] - ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32) - ; WAVE64: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32) - ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32) + ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32) + ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64) ; WAVE32-LABEL: name: extract_vector_elt_v8s64_vv_idx_add1 ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 - ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>) ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -1007,9 +999,9 @@ ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]] ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]] ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]] - ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32) - ; WAVE32: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32) - ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32) + ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32) + ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s32) = COPY $vgpr16 @@ -1032,9 +1024,8 @@ ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -1081,15 +1072,14 @@ ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15 ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]] ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]] - ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) - ; WAVE64: $vgpr0 = COPY [[COPY3]](s32) + ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) + ; WAVE64: $vgpr0 = COPY [[COPY2]](s32) ; WAVE32-LABEL: name: extract_vector_elt_v16s32_sv_idx_add1 ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -1136,8 +1126,8 @@ ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15 ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]] ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]] - ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) - ; WAVE32: $vgpr0 = COPY [[COPY3]](s32) + ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32) + ; WAVE32: $vgpr0 = COPY [[COPY2]](s32) %0:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 %1:_(s32) = COPY $vgpr0 %2:_(s32) = G_CONSTANT i32 1 @@ -1159,9 +1149,8 @@ ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE64: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>) ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -1191,17 +1180,16 @@ ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]] ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]] ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]] - ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32) - ; WAVE64: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32) - ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32) + ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32) + ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64) ; WAVE32-LABEL: name: extract_vector_elt_v8s64_sv_add1 ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]] + ; WAVE32: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[C]] ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>) ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]] @@ -1231,9 +1219,9 @@ ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]] ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]] ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]] - ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32) - ; WAVE32: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32) - ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) + ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32) + ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32) + ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 %1:_(s32) = COPY $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-phi.mir @@ -306,17 +306,16 @@ ; FAST: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; FAST: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]] + ; FAST: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; FAST: G_BRCOND [[ICMP]](s1), %bb.1 ; FAST: G_BR %bb.2 ; FAST: bb.1: ; FAST: successors: %bb.2(0x80000000) - ; FAST: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32) + ; FAST: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32) ; FAST: G_BR %bb.2 ; FAST: bb.2: - ; FAST: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1 + ; FAST: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1 ; FAST: $sgpr0 = COPY [[PHI]](s32) ; FAST: S_SETPC_B64 undef $sgpr30_sgpr31 ; GREEDY-LABEL: name: phi_s32_ss_vcc_sbranch @@ -326,17 +325,16 @@ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]] + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; GREEDY: G_BRCOND [[ICMP]](s1), %bb.1 ; GREEDY: G_BR %bb.2 ; GREEDY: bb.1: ; GREEDY: successors: %bb.2(0x80000000) - ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32) + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32) ; GREEDY: G_BR %bb.2 ; GREEDY: bb.2: - ; GREEDY: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1 + ; GREEDY: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1 ; GREEDY: $sgpr0 = COPY [[PHI]](s32) ; GREEDY: S_SETPC_B64 undef $sgpr30_sgpr31 bb.0: @@ -377,17 +375,16 @@ ; FAST: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; FAST: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]] + ; FAST: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; FAST: G_BRCOND [[ICMP]](s1), %bb.1 ; FAST: G_BR %bb.2 ; FAST: bb.1: ; FAST: successors: %bb.2(0x80000000) - ; FAST: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; FAST: G_BR %bb.2 ; FAST: bb.2: - ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1 + ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1 ; FAST: $vgpr0 = COPY [[PHI]](s32) ; FAST: S_SETPC_B64 undef $sgpr30_sgpr31 ; GREEDY-LABEL: name: phi_s32_sv_vcc_sbranch @@ -397,17 +394,16 @@ ; GREEDY: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]] + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; GREEDY: G_BRCOND [[ICMP]](s1), %bb.1 ; GREEDY: G_BR %bb.2 ; GREEDY: bb.1: ; GREEDY: successors: %bb.2(0x80000000) - ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; GREEDY: G_BR %bb.2 ; GREEDY: bb.2: - ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1 + ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1 ; GREEDY: $vgpr0 = COPY [[PHI]](s32) ; GREEDY: S_SETPC_B64 undef $sgpr30_sgpr31 bb.0: @@ -448,17 +444,16 @@ ; FAST: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; FAST: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]] + ; FAST: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; FAST: G_BRCOND [[ICMP]](s1), %bb.1 ; FAST: G_BR %bb.2 ; FAST: bb.1: ; FAST: successors: %bb.2(0x80000000) - ; FAST: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32) + ; FAST: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32) ; FAST: G_BR %bb.2 ; FAST: bb.2: - ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1 + ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1 ; FAST: $vgpr0 = COPY [[PHI]](s32) ; FAST: S_SETPC_B64 undef $sgpr30_sgpr31 ; GREEDY-LABEL: name: phi_s32_vs_vcc_sbranch @@ -468,17 +463,16 @@ ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]] + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; GREEDY: G_BRCOND [[ICMP]](s1), %bb.1 ; GREEDY: G_BR %bb.2 ; GREEDY: bb.1: ; GREEDY: successors: %bb.2(0x80000000) - ; GREEDY: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32) + ; GREEDY: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY [[COPY1]](s32) ; GREEDY: G_BR %bb.2 ; GREEDY: bb.2: - ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1 + ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1 ; GREEDY: $vgpr0 = COPY [[PHI]](s32) ; GREEDY: S_SETPC_B64 undef $sgpr30_sgpr31 bb.0: @@ -519,17 +513,16 @@ ; FAST: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; FAST: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; FAST: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; FAST: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]] + ; FAST: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; FAST: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; FAST: G_BRCOND [[ICMP]](s1), %bb.1 ; FAST: G_BR %bb.2 ; FAST: bb.1: ; FAST: successors: %bb.2(0x80000000) - ; FAST: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; FAST: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; FAST: G_BR %bb.2 ; FAST: bb.2: - ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1 + ; FAST: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1 ; FAST: $vgpr0 = COPY [[PHI]](s32) ; FAST: S_SETPC_B64 undef $sgpr30_sgpr31 ; GREEDY-LABEL: name: phi_s32_vv_vcc_sbranch @@ -539,17 +532,16 @@ ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; GREEDY: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GREEDY: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[COPY3]] + ; GREEDY: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] ; GREEDY: G_BRCOND [[ICMP]](s1), %bb.1 ; GREEDY: G_BR %bb.2 ; GREEDY: bb.1: ; GREEDY: successors: %bb.2(0x80000000) - ; GREEDY: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; GREEDY: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; GREEDY: G_BR %bb.2 ; GREEDY: bb.2: - ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY4]](s32), %bb.1 + ; GREEDY: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[COPY]](s32), %bb.0, [[COPY3]](s32), %bb.1 ; GREEDY: $vgpr0 = COPY [[PHI]](s32) ; GREEDY: S_SETPC_B64 undef $sgpr30_sgpr31 bb.0: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir @@ -13,7 +13,7 @@ ; CHECK-LABEL: name: gep_p1_s_k ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; CHECK: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1 - ; CHECK: [[GEP:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s64) = G_CONSTANT i64 1 %2:_(p1) = G_PTR_ADD %0, %1 @@ -30,7 +30,7 @@ ; CHECK-LABEL: name: gep_p1_s_s ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3 - ; CHECK: [[GEP:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64) + ; CHECK: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s64) = COPY $sgpr2_sgpr3 %2:_(p1) = G_PTR_ADD %0, %1 @@ -46,9 +46,8 @@ ; CHECK-LABEL: name: gep_p1_v_k ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; CHECK: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[C]](s64) - ; CHECK: [[GEP:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 1 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 1 %2:_(p1) = G_PTR_ADD %0, %1 @@ -66,7 +65,7 @@ ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64) - ; CHECK: [[GEP:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[COPY2]](s64) + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[COPY2]](s64) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $sgpr0_sgpr1 %2:_(p1) = G_PTR_ADD %0, %1 @@ -83,7 +82,7 @@ ; CHECK-LABEL: name: gep_p1_v_v ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 - ; CHECK: [[GEP:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64) + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[COPY1]](s64) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(p1) = G_PTR_ADD %0, %1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrmask.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrmask.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrmask.mir @@ -46,9 +46,8 @@ ; CHECK-LABEL: name: ptrmask_p1_v_k ; CHECK: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; CHECK: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[C]](s64) - ; CHECK: [[PTRMASK:%[0-9]+]]:vgpr(p1) = G_PTRMASK [[COPY]], [[COPY1]](s64) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 1 + ; CHECK: [[PTRMASK:%[0-9]+]]:vgpr(p1) = G_PTRMASK [[COPY]], [[C]](s64) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 1 %2:_(p1) = G_PTRMASK %0, %1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sbfx.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sbfx.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sbfx.mir @@ -37,11 +37,9 @@ ; CHECK-LABEL: name: test_sbfx_s32_vii ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK: [[SBFX:%[0-9]+]]:vgpr(s32) = G_SBFX [[COPY]], [[COPY1]](s32), [[COPY2]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 10 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[SBFX:%[0-9]+]]:vgpr(s32) = G_SBFX [[COPY]], [[C]](s32), [[C1]] ; CHECK: $vgpr0 = COPY [[SBFX]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 10 @@ -139,14 +137,12 @@ ; CHECK-LABEL: name: test_sbfx_s64_vii_small ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[C]](s32) ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[SBFX:%[0-9]+]]:vgpr(s32) = G_SBFX [[UV]], [[C2]](s32), [[COPY2]] + ; CHECK: [[SBFX:%[0-9]+]]:vgpr(s32) = G_SBFX [[UV]], [[C2]](s32), [[C1]] ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 ; CHECK: [[ASHR1:%[0-9]+]]:vgpr(s32) = G_ASHR [[SBFX]], [[C3]](s32) ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[SBFX]](s32), [[ASHR1]](s32) @@ -168,11 +164,9 @@ ; CHECK-LABEL: name: test_sbfx_s64_vii_big ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8 - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 40 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 40 + ; CHECK: [[ASHR:%[0-9]+]]:vgpr(s64) = G_ASHR [[COPY]], [[C]](s32) ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ubfx.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ubfx.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ubfx.mir @@ -37,11 +37,9 @@ ; CHECK-LABEL: name: test_ubfx_s32_vii ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[COPY]], [[COPY1]](s32), [[COPY2]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 10 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[COPY]], [[C]](s32), [[C1]] ; CHECK: $vgpr0 = COPY [[UBFX]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 10 @@ -139,14 +137,12 @@ ; CHECK-LABEL: name: test_ubfx_s64_vii_small ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 31 - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[C]](s32) ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LSHR]](s64) ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[UV]], [[C2]](s32), [[COPY2]] + ; CHECK: [[UBFX:%[0-9]+]]:vgpr(s32) = G_UBFX [[UV]], [[C2]](s32), [[C1]] ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[UBFX]](s32), [[C2]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 @@ -166,11 +162,9 @@ ; CHECK-LABEL: name: test_ubfx_s64_vii_big ; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8 - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 40 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; CHECK: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[COPY1]](s32) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8 + ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 40 + ; CHECK: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[COPY]], [[C]](s32) ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LSHR]](s64) ; CHECK: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir @@ -21,17 +21,17 @@ ; CHECK: %agpr:agpr(s32) = COPY $agpr0 ; CHECK: %voffset:vgpr(s32) = COPY $vgpr1 ; CHECK: %zero:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY %zero(s32) + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY %agpr(s32) + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY %agpr(s32) ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.0, %9, %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY]](s32), implicit $exec ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: G_AMDGPU_BUFFER_STORE %val(s32), %rsrc(<4 x s32>), [[COPY]](s32), %voffset, [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable store (s32), addrspace 4) + ; CHECK: G_AMDGPU_BUFFER_STORE %val(s32), %rsrc(<4 x s32>), [[C]](s32), %voffset, [[V_READFIRSTLANE_B32_]], 0, 0, 0 :: (dereferenceable store (s32), addrspace 4) ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: SI_WATERFALL_LOOP %bb.1, implicit $exec ; CHECK: .2: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-xor.mir @@ -760,12 +760,10 @@ liveins: $vgpr0, $vgpr1 ; CHECK-LABEL: name: xor_i1_vcc_constant ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]] - ; CHECK: [[C1:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true - ; CHECK: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[C1]](s1) - ; CHECK: [[XOR:%[0-9]+]]:vcc(s1) = G_XOR [[ICMP]], [[COPY2]] + ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[C]] + ; CHECK: [[C1:%[0-9]+]]:vcc(s1) = G_CONSTANT i1 true + ; CHECK: [[XOR:%[0-9]+]]:vcc(s1) = G_XOR [[ICMP]], [[C1]] ; CHECK: S_NOP 0, implicit [[XOR]](s1) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4152,7 +4152,9 @@ ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[0:1] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: v_add_i32_e64 v2, s[6:7], 0, v0 @@ -4168,7 +4170,9 @@ ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[0:1] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: v_add_u32_e64 v2, s[6:7], 0, v0 @@ -4184,7 +4188,9 @@ ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[2:3], v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: v_add_co_u32_e64 v2, s[6:7], 0, v0 @@ -4198,16 +4204,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5 -; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, v4 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[0:1] +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[4:5] +; GFX10-NEXT: v_add_co_u32 v0, s5, v8, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v8, s5 +; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result @@ -4224,7 +4232,9 @@ ; GFX6-NEXT: s_addc_u32 s5, s1, s3 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], v[0:1] ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: s_add_u32 s0, s2, 0 @@ -4252,7 +4262,9 @@ ; GFX8-NEXT: s_addc_u32 s5, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], v[0:1] ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: s_add_u32 s0, s2, 0 @@ -4280,7 +4292,9 @@ ; GFX9-NEXT: s_addc_u32 s5, s1, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], v[0:1] ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: s_add_u32 s0, s2, 0 @@ -4300,25 +4314,27 @@ ; ; GFX10-LABEL: s_saddsat_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_add_u32 s4, s0, s2 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: s_and_b32 s5, s5, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_addc_u32 s5, s1, s3 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 -; GFX10-NEXT: s_ashr_i32 s2, s5, 31 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX10-NEXT: s_ashr_i32 s1, s5, 31 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: s_xor_b32 s3, s1, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 0 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s3 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_addc_u32 s1, s2, 0x80000000 +; GFX10-NEXT: s_xor_b32 s2, vcc_lo, s0 +; GFX10-NEXT: s_add_u32 s0, s1, 0 +; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX10-NEXT: s_and_b32 s3, s3, 1 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) @@ -4330,9 +4346,11 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, 0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc +; GFX6-NEXT: v_mov_b32_e32 v5, v4 ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], v[0:1], v[4:5] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: v_add_i32_e64 v4, s[2:3], 0, v0 @@ -4346,9 +4364,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], 0, v0 @@ -4362,9 +4382,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[0:1], v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], 0, v0 @@ -4376,16 +4398,18 @@ ; ; GFX10-LABEL: saddsat_i64_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3] -; GFX10-NEXT: v_add_co_u32 v0, s1, v4, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 -; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s0, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[4:5] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, s1, v6, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v6, s1 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> @@ -4399,12 +4423,14 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], v[0:1] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: v_add_i32_e64 v4, s[0:1], 0, v0 -; GFX6-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX6-NEXT: v_add_i32_e64 v4, s[2:3], 0, v0 +; GFX6-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: ; return to shader part epilog @@ -4415,12 +4441,14 @@ ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], v[0:1] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], 0, v0 -; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], 0, v0 +; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: ; return to shader part epilog @@ -4431,28 +4459,32 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], 0, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], 0, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[2:3], v0, v1, s[2:3] +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_i64_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[0:1], 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v0, s0, v4, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 -; GFX10-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, s1, v6, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v6, s1 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> @@ -4546,19 +4578,20 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v9 +; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 ; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 +; GFX10-NEXT: v_add_co_u32 v5, s5, v0, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s5, v0, v1, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 +; GFX10-NEXT: v_add_co_u32 v2, s7, v4, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, v4, v1, s7 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo @@ -5518,42 +5551,43 @@ ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e64 v1, v18, 0, vcc_lo -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, v12 -; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo +; GFX10-NEXT: v_add_co_u32 v10, s4, v4, v12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s4, v5, v13, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s4, v6, v14, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v13, s4, v7, v15, s4 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[10:11], v[4:5] -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v17 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v18, 0, vcc_lo +; GFX10-NEXT: v_bfrev_b32_e32 v18, 1 ; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[12:13], v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[12:13], v[6:7] -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v2, 0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[14:15] +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v13 -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, 1, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v4, s5 ; GFX10-NEXT: v_cmp_eq_u64_e64 s5, 0, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, 0, s5 -; GFX10-NEXT: v_xor_b32_e32 v4, v4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v2, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v2, v18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v19, 0, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v16, v5, s4 +; GFX10-NEXT: v_xor_b32_e32 v4, v4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s4 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v4 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v7, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v7, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v7, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v18, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v17, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v5, s5 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -8,11 +8,13 @@ ; CHECK-LABEL: v_sdiv_i64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v5, v1 ; CHECK-NEXT: v_mov_b32_e32 v4, v0 -; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 +; CHECK-NEXT: v_mov_b32_e32 v6, v0 +; CHECK-NEXT: v_mov_b32_e32 v7, v0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -174,17 +176,17 @@ ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mul_lo_u32 v3, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -201,8 +203,10 @@ ; CHECK-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, -1 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1] -; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, s[6:7], v[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_ashr_i32 s6, s3, 31 @@ -684,13 +688,15 @@ ; CGP-LABEL: v_sdiv_v2i64: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_mov_b32_e32 v11, v1 ; CGP-NEXT: v_mov_b32_e32 v10, v0 -; CGP-NEXT: v_or_b32_e32 v1, v11, v5 +; CGP-NEXT: v_mov_b32_e32 v11, v1 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_mov_b32_e32 v8, v2 ; CGP-NEXT: v_mov_b32_e32 v9, v3 +; CGP-NEXT: v_or_b32_e32 v1, v11, v5 +; CGP-NEXT: v_mov_b32_e32 v2, v0 +; CGP-NEXT: v_mov_b32_e32 v3, v0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -852,22 +858,24 @@ ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_mul_lo_u32 v2, v0, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: .LBB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: v_or_b32_e32 v3, v9, v7 +; CGP-NEXT: v_mov_b32_e32 v4, v2 +; CGP-NEXT: v_mov_b32_e32 v5, v2 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -1029,17 +1037,17 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] @@ -2477,13 +2485,15 @@ ; CHECK-LABEL: v_sdiv_i64_pow2_shl_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 -; CHECK-NEXT: v_mov_b32_e32 v4, v1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 +; CHECK-NEXT: v_mov_b32_e32 v7, v1 +; CHECK-NEXT: v_or_b32_e32 v8, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, v[7:8], v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -2645,17 +2655,17 @@ ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v0, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mul_lo_u32 v2, v0, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -2960,12 +2970,14 @@ ; CGP-NEXT: v_mov_b32_e32 v5, v2 ; CGP-NEXT: v_mov_b32_e32 v7, v3 ; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 -; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_mov_b32_e32 v0, 0 +; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_mov_b32_e32 v12, v0 +; CGP-NEXT: v_mov_b32_e32 v13, v0 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[12:13] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -3127,22 +3139,24 @@ ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_mul_lo_u32 v3, v0, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CGP-NEXT: .LBB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v3, v7, v11 ; CGP-NEXT: v_mov_b32_e32 v2, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: v_mov_b32_e32 v9, v2 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[8:9] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -3304,17 +3318,17 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -8,7 +8,6 @@ ; GFX6-LABEL: v_shl_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -123,7 +122,6 @@ ; GCN-LABEL: v_shl_i24: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -131,7 +129,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = shl i24 %value, %amount @@ -567,7 +564,6 @@ ; GFX6-LABEL: v_shl_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -665,7 +661,6 @@ define amdgpu_ps half @shl_i16_sv(i16 inreg %value, i16 %amount) { ; GFX6-LABEL: shl_i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 ; GFX6-NEXT: ; return to shader part epilog ; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -380,7 +380,7 @@ ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -8,11 +8,13 @@ ; CHECK-LABEL: v_srem_i64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v5, v1 ; CHECK-NEXT: v_mov_b32_e32 v4, v0 -; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 +; CHECK-NEXT: v_mov_b32_e32 v6, v0 +; CHECK-NEXT: v_mov_b32_e32 v7, v0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -172,15 +174,15 @@ ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mul_lo_u32 v0, v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -197,8 +199,10 @@ ; CHECK-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, -1 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[0:1] -; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, s[6:7], v[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: s_ashr_i32 s6, s3, 31 @@ -672,13 +676,15 @@ ; CGP-LABEL: v_srem_v2i64: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_mov_b32_e32 v11, v1 ; CGP-NEXT: v_mov_b32_e32 v10, v0 -; CGP-NEXT: v_or_b32_e32 v1, v11, v5 +; CGP-NEXT: v_mov_b32_e32 v11, v1 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: v_mov_b32_e32 v8, v2 ; CGP-NEXT: v_mov_b32_e32 v9, v3 +; CGP-NEXT: v_or_b32_e32 v1, v11, v5 +; CGP-NEXT: v_mov_b32_e32 v2, v0 +; CGP-NEXT: v_mov_b32_e32 v3, v0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -838,20 +844,22 @@ ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 +; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: .LBB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: v_or_b32_e32 v3, v9, v7 +; CGP-NEXT: v_mov_b32_e32 v4, v2 +; CGP-NEXT: v_mov_b32_e32 v5, v2 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -1011,15 +1019,15 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v6 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] @@ -2441,13 +2449,15 @@ ; CHECK-LABEL: v_srem_i64_pow2_shl_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 -; CHECK-NEXT: v_mov_b32_e32 v4, v1 ; CHECK-NEXT: v_mov_b32_e32 v3, v0 -; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 +; CHECK-NEXT: v_mov_b32_e32 v7, v1 +; CHECK-NEXT: v_or_b32_e32 v8, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, v[7:8], v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -2607,15 +2617,15 @@ ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mul_lo_u32 v0, v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -2916,12 +2926,14 @@ ; CGP-NEXT: v_mov_b32_e32 v5, v2 ; CGP-NEXT: v_mov_b32_e32 v7, v3 ; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 -; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_mov_b32_e32 v9, v1 ; CGP-NEXT: v_mov_b32_e32 v0, 0 +; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_mov_b32_e32 v12, v0 +; CGP-NEXT: v_mov_b32_e32 v13, v0 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[12:13] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -3081,20 +3093,22 @@ ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: .LBB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v3, v7, v11 ; CGP-NEXT: v_mov_b32_e32 v2, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: v_mov_b32_e32 v9, v2 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[8:9] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -3254,15 +3268,15 @@ ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v10 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v10 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v10 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4138,7 +4138,9 @@ ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[0:1] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: v_add_i32_e64 v2, s[6:7], 0, v0 @@ -4154,7 +4156,9 @@ ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[0:1] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: v_add_u32_e64 v2, s[6:7], 0, v0 @@ -4170,7 +4174,9 @@ ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], v[2:3], v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: v_add_co_u32_e64 v2, s[6:7], 0, v0 @@ -4184,16 +4190,18 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5 -; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, v4 +; GFX10-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[0:1] +; GFX10-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[4:5] +; GFX10-NEXT: v_add_co_u32 v0, s5, v8, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v8, s5 +; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result @@ -4210,7 +4218,9 @@ ; GFX6-NEXT: s_subb_u32 s5, s1, s3 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], v[0:1] ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: s_add_u32 s0, s2, 0 @@ -4238,7 +4248,9 @@ ; GFX8-NEXT: s_subb_u32 s5, s1, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], v[0:1] ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: s_add_u32 s0, s2, 0 @@ -4266,7 +4278,9 @@ ; GFX9-NEXT: s_subb_u32 s5, s1, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], v[0:1] ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: s_add_u32 s0, s2, 0 @@ -4286,25 +4300,27 @@ ; ; GFX10-LABEL: s_ssubsat_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_sub_u32 s4, s0, s2 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: s_and_b32 s5, s5, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_subb_u32 s5, s1, s3 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] -; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 -; GFX10-NEXT: s_ashr_i32 s2, s5, 31 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] +; GFX10-NEXT: s_ashr_i32 s1, s5, 31 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: s_xor_b32 s3, s1, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 0 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s3 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_addc_u32 s1, s2, 0x80000000 +; GFX10-NEXT: s_xor_b32 s2, vcc_lo, s0 +; GFX10-NEXT: s_add_u32 s0, s1, 0 +; GFX10-NEXT: s_cselect_b32 s3, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX10-NEXT: s_and_b32 s3, s3, 1 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) @@ -4316,9 +4332,11 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v0 +; GFX6-NEXT: v_mov_b32_e32 v4, 0 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; GFX6-NEXT: v_mov_b32_e32 v5, v4 ; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], v[0:1], v[4:5] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX6-NEXT: v_add_i32_e64 v4, s[2:3], 0, v0 @@ -4332,9 +4350,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, v4 ; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], v[0:1], v[4:5] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], 0, v0 @@ -4348,9 +4368,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, v4 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], v[0:1], v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], 0, v0 @@ -4362,16 +4384,18 @@ ; ; GFX10-LABEL: ssubsat_i64_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[0:1] -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3] -; GFX10-NEXT: v_add_co_u32 v0, s1, v4, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 -; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[4:5] +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, s1, v6, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v6, s1 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> @@ -4385,12 +4409,14 @@ ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[0:1] ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX6-NEXT: v_add_i32_e64 v4, s[0:1], 0, v0 -; GFX6-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX6-NEXT: v_add_i32_e64 v4, s[2:3], 0, v0 +; GFX6-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] +; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: ; return to shader part epilog @@ -4401,12 +4427,14 @@ ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[0:1] ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], 0, v0 -; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], 0, v0 +; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] +; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: ; return to shader part epilog @@ -4417,28 +4445,32 @@ ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], 0, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], v0, v1, s[0:1] -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], 0, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[2:3], v0, v1, s[2:3] +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_i64_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[0:1], 0 -; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v0, s0, v4, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 -; GFX10-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, s1, v6, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v6, s1 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> @@ -4532,19 +4564,20 @@ ; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo ; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v9 +; GFX10-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 ; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 +; GFX10-NEXT: v_add_co_u32 v5, s5, v0, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s5, v0, v1, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 +; GFX10-NEXT: v_add_co_u32 v2, s7, v4, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, v4, v1, s7 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo ; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo @@ -5534,6 +5567,7 @@ ; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo ; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] +; GFX10-NEXT: v_bfrev_b32_e32 v20, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo @@ -5542,44 +5576,44 @@ ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 -; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, v6, v14, vcc_lo -; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v7, v15, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v8, s4, v4, v12 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s4, v5, v13, s4 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s4, v6, v14, s4 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s4, v7, v15, s4 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[8:9], v[4:5] ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v19 +; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v2, 0 ; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[10:11], v[6:7] +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[6:7] ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, 0, v[12:13] -; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[14:15] ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v4, s5 ; GFX10-NEXT: v_cmp_eq_u64_e64 s5, 0, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v3, s4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v2, v20, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v13, v12, s5 -; GFX10-NEXT: v_xor_b32_e32 v4, v4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v5, s4 +; GFX10-NEXT: v_xor_b32_e32 v4, v4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v3, s4 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v4 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v7, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v7, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v20, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v4, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v5, s5 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -10,9 +10,11 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v4, v0 ; CHECK-NEXT: v_mov_b32_e32 v5, v1 -; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 +; CHECK-NEXT: v_mov_b32_e32 v6, v0 +; CHECK-NEXT: v_mov_b32_e32 v7, v0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -151,25 +153,25 @@ ; CHECK-NEXT: s_cbranch_execz .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mul_lo_u32 v3, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -186,8 +188,10 @@ ; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3] ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s5, -1 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] -; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, s[6:7], v[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -624,9 +628,11 @@ ; CGP-NEXT: v_mov_b32_e32 v11, v1 ; CGP-NEXT: v_mov_b32_e32 v8, v2 ; CGP-NEXT: v_mov_b32_e32 v9, v3 -; CGP-NEXT: v_or_b32_e32 v1, v11, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_or_b32_e32 v1, v11, v5 +; CGP-NEXT: v_mov_b32_e32 v2, v0 +; CGP-NEXT: v_mov_b32_e32 v3, v0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -765,30 +771,32 @@ ; CGP-NEXT: s_cbranch_execz .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 +; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 -; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v0 +; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_mul_lo_u32 v2, v0, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: .LBB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: v_or_b32_e32 v3, v9, v7 +; CGP-NEXT: v_mov_b32_e32 v4, v2 +; CGP-NEXT: v_mov_b32_e32 v5, v2 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -927,25 +935,25 @@ ; CGP-NEXT: s_cbranch_execz .LBB2_8 ; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 +; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v2, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] @@ -1068,11 +1076,13 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v3, v0 ; CHECK-NEXT: v_mov_b32_e32 v4, v1 -; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 -; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 +; CHECK-NEXT: v_or_b32_e32 v8, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v7, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, v[7:8], v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -1211,25 +1221,25 @@ ; CHECK-NEXT: s_cbranch_execz .LBB7_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v0 +; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v0, v5 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mul_lo_u32 v2, v0, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1507,9 +1517,11 @@ ; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 ; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 -; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_mov_b32_e32 v12, v0 +; CGP-NEXT: v_mov_b32_e32 v13, v0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[12:13] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -1648,30 +1660,32 @@ ; CGP-NEXT: s_cbranch_execz .LBB8_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 -; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_mul_lo_u32 v3, v0, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; CGP-NEXT: .LBB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v3, v7, v11 ; CGP-NEXT: v_mov_b32_e32 v2, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: v_mov_b32_e32 v9, v2 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[8:9] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -1810,25 +1824,25 @@ ; CGP-NEXT: s_cbranch_execz .LBB8_8 ; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 +; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v2, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] ; CGP-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -10,9 +10,11 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v4, v0 ; CHECK-NEXT: v_mov_b32_e32 v5, v1 -; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 +; CHECK-NEXT: v_mov_b32_e32 v6, v0 +; CHECK-NEXT: v_mov_b32_e32 v7, v0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -150,23 +152,23 @@ ; CHECK-NEXT: s_cbranch_execz .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 ; CHECK-NEXT: v_mul_lo_u32 v0, v0, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -183,8 +185,10 @@ ; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3] ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s5, -1 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], s[4:5] -; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[6:7], 0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, s[6:7], v[0:1] ; CHECK-NEXT: s_cbranch_vccz .LBB1_2 ; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -616,9 +620,11 @@ ; CGP-NEXT: v_mov_b32_e32 v11, v1 ; CGP-NEXT: v_mov_b32_e32 v8, v2 ; CGP-NEXT: v_mov_b32_e32 v9, v3 -; CGP-NEXT: v_or_b32_e32 v1, v11, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_or_b32_e32 v1, v11, v5 +; CGP-NEXT: v_mov_b32_e32 v2, v0 +; CGP-NEXT: v_mov_b32_e32 v3, v0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[2:3] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -756,28 +762,30 @@ ; CGP-NEXT: s_cbranch_execz .LBB2_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4 +; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 -; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v0 +; CGP-NEXT: v_mul_hi_u32 v2, v0, v2 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: .LBB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v3, v9, v7 ; CGP-NEXT: v_mov_b32_e32 v2, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: v_or_b32_e32 v3, v9, v7 +; CGP-NEXT: v_mov_b32_e32 v4, v2 +; CGP-NEXT: v_mov_b32_e32 v5, v2 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -915,23 +923,23 @@ ; CGP-NEXT: s_cbranch_execz .LBB2_8 ; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 +; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v2, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v6 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: .LBB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] @@ -1100,11 +1108,11 @@ ; GISEL-LABEL: v_urem_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8 -; GISEL-NEXT: s_sub_u32 s6, 0, s8 -; GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 +; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb +; GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v6, v4 ; GISEL-NEXT: s_and_b32 s4, s4, 1 ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 @@ -1611,11 +1619,13 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v3, v0 ; CHECK-NEXT: v_mov_b32_e32 v4, v1 -; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 -; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2 +; CHECK-NEXT: v_or_b32_e32 v8, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v7, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, v[7:8], v[0:1] ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -1753,23 +1763,23 @@ ; CHECK-NEXT: s_cbranch_execz .LBB7_4 ; CHECK-NEXT: ; %bb.3: ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v0 +; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 ; CHECK-NEXT: v_mul_lo_u32 v0, v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: .LBB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -2045,9 +2055,11 @@ ; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 ; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 ; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 -; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_mov_b32_e32 v0, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_or_b32_e32 v1, v9, v3 +; CGP-NEXT: v_mov_b32_e32 v12, v0 +; CGP-NEXT: v_mov_b32_e32 v13, v0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[12:13] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -2185,28 +2197,30 @@ ; CGP-NEXT: s_cbranch_execz .LBB8_4 ; CGP-NEXT: ; %bb.3: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 -; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v3, v0, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_mov_b32_e32 v1, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: .LBB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v3, v7, v11 ; CGP-NEXT: v_mov_b32_e32 v2, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: v_mov_b32_e32 v9, v2 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[8:9] ; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -2344,23 +2358,23 @@ ; CGP-NEXT: s_cbranch_execz .LBB8_8 ; CGP-NEXT: ; %bb.7: ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 +; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v2 +; CGP-NEXT: v_mul_hi_u32 v4, v2, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v10 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v10 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v10 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v3, 0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: .LBB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -258,12 +258,19 @@ } define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 { -; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr: -; FIXEDABI: ; %bb.0: -; FIXEDABI-NEXT: v_mov_b32_e32 v0, 0 -; FIXEDABI-NEXT: v_mov_b32_e32 v1, 0 -; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc -; FIXEDABI-NEXT: s_endpgm +; FIXEDABI-SDAG-LABEL: marked_kernel_nokernargs_implicitarg_ptr: +; FIXEDABI-SDAG: ; %bb.0: +; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; FIXEDABI-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc +; FIXEDABI-SDAG-NEXT: s_endpgm +; +; FIXEDABI-GISEL-LABEL: marked_kernel_nokernargs_implicitarg_ptr: +; FIXEDABI-GISEL: ; %bb.0: +; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; FIXEDABI-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc +; FIXEDABI-GISEL-NEXT: s_endpgm %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() %implicitarg.load = load volatile i8, i8 addrspace(4)* %implicitarg.ptr ret void Index: llvm/test/CodeGen/AMDGPU/constrained-shift.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -16,7 +16,6 @@ ; GISEL-LABEL: csh_16: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v1, 15, v1 ; GISEL-NEXT: v_lshlrev_b16_e32 v2, v1, v0 ; GISEL-NEXT: v_lshrrev_b16_e32 v3, v1, v0 ; GISEL-NEXT: v_ashrrev_i16_e32 v0, v1, v0 @@ -45,7 +44,6 @@ ; GISEL-LABEL: csh_32: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v1, 31, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, v1, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v3, v1, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v0, v1, v0 @@ -351,7 +349,6 @@ ; GISEL-LABEL: cshl_or: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v1, 31, v1 ; GISEL-NEXT: v_lshl_or_b32 v0, v0, v1, v0 ; GISEL-NEXT: s_setpc_b64 s[30:31] %and = and i32 %b, 31 @@ -370,7 +367,6 @@ ; GISEL-LABEL: cshl_add: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v1, 31, v1 ; GISEL-NEXT: v_lshl_add_u32 v0, v0, v1, v2 ; GISEL-NEXT: s_setpc_b64 s[30:31] %and = and i32 %b, 31 @@ -389,8 +385,7 @@ ; GISEL-LABEL: add_cshl: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v2, 31, v1 -; GISEL-NEXT: v_add_lshl_u32 v0, v0, v1, v2 +; GISEL-NEXT: v_add_lshl_u32 v0, v0, v1, v1 ; GISEL-NEXT: s_setpc_b64 s[30:31] %add = add i32 %a, %b %and = and i32 %b, 31 Index: llvm/test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ctlz.ll +++ llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -978,6 +978,7 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -985,8 +986,7 @@ ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -1074,6 +1074,7 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1081,8 +1082,7 @@ ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -1392,6 +1392,7 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 @@ -1399,8 +1400,7 @@ ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid Index: llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -773,13 +773,13 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -853,13 +853,13 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -944,12 +944,12 @@ ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 -; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 24, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1302,13 +1302,13 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1388,13 +1388,13 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/cttz.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cttz.ll +++ llvm/test/CodeGen/AMDGPU/cttz.ll @@ -971,6 +971,7 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -978,8 +979,7 @@ ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -1067,6 +1067,7 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1074,8 +1075,7 @@ ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid Index: llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -1029,49 +1029,53 @@ ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:7 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v0, s[2:3] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v7, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v0, s[2:3] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v9, v0, s[2:3] offset:6 ; GFX9-GISEL-NEXT: s_movk_i32 s2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(7) -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) ; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX9-GISEL-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v3, 8, v3 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(3) +; GFX9-GISEL-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) ; GFX9-GISEL-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-GISEL-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-GISEL-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_or_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX9-GISEL-NEXT: v_bfe_u32 v3, v3, 0, 16 ; GFX9-GISEL-NEXT: v_bfe_u32 v4, v4, 0, 16 -; GFX9-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX9-GISEL-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v2, v2, 16, v0 -; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v4, v3 -; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v2 -; GFX9-GISEL-NEXT: v_add_u32_e32 v4, 32, v4 -; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: v_bfe_u32 v5, v5, 0, 16 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v3, v5, 16, v4 +; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v5, v3 +; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v4, v2 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v4, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, i64 addrspace(1)* %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone Index: llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll +++ llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll @@ -1,10 +1,11 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs --stress-regalloc=10 < %s | FileCheck -check-prefixes=GCN,GISEL %s ; GCN-LABEL: {{^}}test_remat_sgpr: ; GCN-NOT: v_writelane_b32 ; GCN: {{^}}[[LOOP:.LBB[0-9_]+]]: -; GCN-COUNT-6: s_mov_b32 s{{[0-9]+}}, 0x +; SDAG-COUNT-6: s_mov_b32 s{{[0-9]+}}, 0x +; GISEL-COUNT-4: s_mov_b32 s{{[0-9]+}}, 0x ; GCN-NOT: v_writelane_b32 ; GCN: s_cbranch_{{[^ ]+}} [[LOOP]] ; GCN: .sgpr_spill_count: 0