diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -99,7 +99,7 @@
   bool selectG_EXTRACT(MachineInstr &I) const;
   bool selectG_MERGE_VALUES(MachineInstr &I) const;
   bool selectG_UNMERGE_VALUES(MachineInstr &I) const;
-  bool selectG_BUILD_VECTOR_TRUNC(MachineInstr &I) const;
+  bool selectG_BUILD_VECTOR(MachineInstr &I) const;
   bool selectG_PTR_ADD(MachineInstr &I) const;
   bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
   bool selectG_INSERT(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -606,30 +606,42 @@
   return true;
 }
-bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
-    MachineInstr &MI) const {
-  if (selectImpl(MI, *CoverageInfo))
-    return true;
+bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
+  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
+         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
-  const LLT S32 = LLT::scalar(32);
-  const LLT V2S16 = LLT::fixed_vector(2, 16);
+  Register Src0 = MI.getOperand(1).getReg();
+  Register Src1 = MI.getOperand(2).getReg();
+  LLT SrcTy = MRI->getType(Src0);
+  const unsigned SrcSize = SrcTy.getSizeInBits();
+
+  // A G_BUILD_VECTOR with a >= 32-bit source is handled by selectG_MERGE_VALUES.
+  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
+    return selectG_MERGE_VALUES(MI);
+  }
+
+  // Selection logic below is for V2S16 only.
+  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
   Register Dst = MI.getOperand(0).getReg();
-  if (MRI->getType(Dst) != V2S16)
-    return false;
+  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
+      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
+       SrcTy != LLT::scalar(32)))
+    return selectImpl(MI, *CoverageInfo);
   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
-  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
+  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
     return false;
-  Register Src0 = MI.getOperand(1).getReg();
-  Register Src1 = MI.getOperand(2).getReg();
-  if (MRI->getType(Src0) != S32)
-    return false;
+  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
+         DstBank->getID() == AMDGPU::VGPRRegBankID);
+  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
   const DebugLoc &DL = MI.getDebugLoc();
   MachineBasicBlock *BB = MI.getParent();
+
+  // First, before trying TableGen patterns, check if both sources are
+  // constants. In those cases, we can trivially compute the final constant
+  // and emit a simple move.
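+  // For example (illustrative constants, not taken from this patch's tests):
+  // with K0 = 0x12345678 and K1 = 0xabcdef01, Lo16 = 0x5678 and Hi16 = 0xef01,
+  // so the packed immediate is 0xef015678 and the whole build_vector collapses
+  // into a single S_MOV_B32 (SGPR) or V_MOV_B32 (VGPR) of that immediate.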
   auto ConstSrc1 =
       getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
   if (ConstSrc1) {
     auto ConstSrc0 =
@@ -639,22 +651,50 @@
       const int64_t K1 = ConstSrc1->Value.getSExtValue();
       uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
       uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
+      uint32_t Imm = Lo16 | (Hi16 << 16);
+
+      // VALU
+      if (IsVector) {
+        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
+        MI.eraseFromParent();
+        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
+      }
-      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
-          .addImm(Lo16 | (Hi16 << 16));
+      // SALU
+      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
       MI.eraseFromParent();
       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
     }
   }
+  // Now try TableGen patterns.
+  if (selectImpl(MI, *CoverageInfo))
+    return true;
+
   // TODO: This should probably be a combine somewhere
-  // (build_vector_trunc $src0, undef -> copy $src0
+  // (build_vector $src0, undef) -> copy $src0
   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
     MI.setDesc(TII.get(AMDGPU::COPY));
     MI.removeOperand(2);
-    return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
-           RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
+    const auto &RC =
+        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
+    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
+           RBI.constrainGenericRegister(Src0, RC, *MRI);
+  }
+
+  // TODO: Can this be improved?
+  if (IsVector) {
+    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
+        .addImm(0xFFFF)
+        .addReg(Src0);
+    BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
+        .addReg(Src1)
+        .addImm(16)
+        .addReg(TmpReg);
+    MI.eraseFromParent();
+    return true;
   }

   Register ShiftSrc0;
@@ -663,13 +703,13 @@
   // With multiple uses of the shift, this will duplicate the shift and
   // increase register pressure.
   //
-  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
+  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
   // => (S_PACK_HH_B32_B16 $src0, $src1)
-  // (build_vector_trunc (lshr_oneuse SReg_32:$src0, 16), $src1)
+  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
   // => (S_PACK_HL_B32_B16 $src0, $src1)
-  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
+  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
   // => (S_PACK_LH_B32_B16 $src0, $src1)
-  // (build_vector_trunc $src0, $src1)
+  // (build_vector $src0, $src1)
   // => (S_PACK_LL_B32_B16 $src0, $src1)

   bool Shift0 = mi_match(
@@ -687,6 +727,8 @@
     Opc = AMDGPU::S_PACK_LH_B32_B16;
     MI.getOperand(2).setReg(ShiftSrc1);
   } else if (Shift0) {
+    auto ConstSrc1 =
+        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
     if (ConstSrc1 && ConstSrc1->Value == 0) {
       // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
       auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
@@ -3505,13 +3547,13 @@
   case TargetOpcode::G_EXTRACT:
     return selectG_EXTRACT(I);
   case TargetOpcode::G_MERGE_VALUES:
-  case TargetOpcode::G_BUILD_VECTOR:
   case TargetOpcode::G_CONCAT_VECTORS:
     return selectG_MERGE_VALUES(I);
   case TargetOpcode::G_UNMERGE_VALUES:
     return selectG_UNMERGE_VALUES(I);
+  case TargetOpcode::G_BUILD_VECTOR:
   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
-    return selectG_BUILD_VECTOR_TRUNC(I);
+    return selectG_BUILD_VECTOR(I);
   case TargetOpcode::G_PTR_ADD:
     if (selectImpl(I, *CoverageInfo))
       return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -726,7 +726,6 @@
   if (ST.hasVOP3PInsts()) {
     MinNumMaxNum.customFor(FPTypesPK16)
       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
-      .clampMaxNumElements(0, S16, 2)
       .clampScalar(0, S16, S64)
       .scalarize(0);
   } else if (ST.has16BitInsts()) {
@@ -1442,7 +1441,7 @@
       VecTy.getSizeInBits() <= MaxRegisterSize &&
       IdxTy.getSizeInBits() == 32;
   })
-  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
+  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 16)),
             bitcastToVectorElement32(VecTypeIdx))
   //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
   .bitcastIf(
@@ -1461,7 +1460,7 @@
       VecTypeIdx,
       LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
   })
-  .clampScalar(EltTypeIdx, S32, S64)
+  .clampScalar(EltTypeIdx, S16, S64)
   .clampScalar(VecTypeIdx, S32, S64)
   .clampScalar(IdxTypeIdx, S32, S32)
   .clampMaxNumElements(VecTypeIdx, S32, 32)
@@ -1526,13 +1525,11 @@
   BuildVector
     // FIXME: Should probably widen s1 vectors straight to s32
     .minScalarOrElt(0, S16)
-    // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
-    .minScalar(1, S32);
+    .minScalar(1, S16);

   getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
     .legalFor({V2S16, S32})
     .lower();
-  BuildVector.minScalarOrElt(0, S32);
   } else {
     BuildVector.customFor({V2S16, S16});
     BuildVector.minScalarOrElt(0, S32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2614,69 +2614,6 @@
     break;
   }
-
-  case AMDGPU::G_BUILD_VECTOR:
-  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
-    Register DstReg = MI.getOperand(0).getReg();
-    LLT DstTy = MRI.getType(DstReg);
-    if (DstTy != LLT::fixed_vector(2, 16))
-      break;
-
-    assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
-    substituteSimpleCopyRegs(OpdMapper, 1);
-    substituteSimpleCopyRegs(OpdMapper, 2);
-
-    const RegisterBank *DstBank =
-      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
-    if (DstBank == &AMDGPU::SGPRRegBank)
-      break; // Can use S_PACK_* instructions.
-
-    MachineIRBuilder B(MI);
-
-    Register Lo = MI.getOperand(1).getReg();
-    Register Hi = MI.getOperand(2).getReg();
-    const LLT S32 = LLT::scalar(32);
-
-    const RegisterBank *BankLo =
-      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
-    const RegisterBank *BankHi =
-      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
-
-    Register ZextLo;
-    Register ShiftHi;
-
-    if (Opc == AMDGPU::G_BUILD_VECTOR) {
-      ZextLo = B.buildZExt(S32, Lo).getReg(0);
-      MRI.setRegBank(ZextLo, *BankLo);
-
-      Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
-      MRI.setRegBank(ZextHi, *BankHi);
-
-      auto ShiftAmt = B.buildConstant(S32, 16);
-      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
-
-      ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
-      MRI.setRegBank(ShiftHi, *BankHi);
-    } else {
-      Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
-      MRI.setRegBank(MaskLo, *BankLo);
-
-      auto ShiftAmt = B.buildConstant(S32, 16);
-      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
-
-      ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
-      MRI.setRegBank(ShiftHi, *BankHi);
-
-      ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
-      MRI.setRegBank(ZextLo, *BankLo);
-    }
-
-    auto Or = B.buildOr(S32, ZextLo, ShiftHi);
-    MRI.setRegBank(Or.getReg(0), *DstBank);
-
-    B.buildBitcast(DstReg, Or);
-    MI.eraseFromParent();
-    return;
-  }
   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-concat-vectors.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-concat-vectors.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-concat-vectors.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-concat-vectors.mir
@@ -21,16 +21,17 @@
   ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32)
   ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
   ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32)
-  ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[LSHR]](s32)
-  ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32)
-  ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-  ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
-  ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
-  ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32)
-  ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
-  ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C3]](s32)
-  ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
-  ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+  ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+  ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+  ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+  ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+  ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
+  ; GFX9-NEXT:
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR2]], [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR3]], [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SHL]](<2 x s16>), [[SHL1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -788,30 +788,23 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v7, 16, v2 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v3, v4, v9, v3 -; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 -; GFX9-NEXT: v_and_or_b32 v4, v5, v9, s4 -; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_pk_add_f16 v1, v4, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_pk_add_f16 v1, v5, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: @@ -819,24 +812,18 @@ ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-CONTRACT-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v4, v4, v9, v6 -; GFX9-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-CONTRACT-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-CONTRACT-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-CONTRACT-NEXT: 
v_and_b32_e32 v4, 0xffff, v4 +; GFX9-CONTRACT-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-CONTRACT-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; GFX9-CONTRACT-NEXT: v_lshl_or_b32 v4, v8, 16, v4 ; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v5, v5, v9, s4 ; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-CONTRACT-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-CONTRACT-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs: @@ -844,30 +831,23 @@ ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-DENORM-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-DENORM-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-DENORM-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-DENORM-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-DENORM-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-DENORM-NEXT: v_lshl_or_b32 v2, v7, 16, v2 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-DENORM-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-DENORM-NEXT: v_and_or_b32 v3, v4, v9, v3 -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v2 +; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-DENORM-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX9-DENORM-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-DENORM-NEXT: v_lshl_or_b32 v3, v8, 16, v3 +; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v3, v0 -; GFX9-DENORM-NEXT: v_and_or_b32 v4, v5, v9, s4 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 ; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v4, v1 -; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-DENORM-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 +; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: @@ -875,24 +855,18 @@ ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-UNSAFE-NEXT: v_mov_b32_e32 v9, 0xffff -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v2, v2, v9, v6 -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v4, v4, v9, v6 -; GFX9-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-UNSAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-UNSAFE-NEXT: 
v_and_b32_e32 v2, 0xffff, v2 +; GFX9-UNSAFE-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-UNSAFE-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-UNSAFE-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; GFX9-UNSAFE-NEXT: v_lshl_or_b32 v4, v8, 16, v4 ; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v3, v3, v9, s4 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v5, v5, v9, s4 ; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-UNSAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v2 -; GFX9-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-UNSAFE-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_3xhalf_add_mul_rhs: @@ -901,29 +875,23 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v3, s4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; GFX10-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v4, v2 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX10-NEXT: v_pk_add_f16 v0, v2, v0 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: v_pk_add_f16 v1, v2, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: @@ -933,22 +901,17 @@ ; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, 0xffff, v4, v8 +; GFX10-CONTRACT-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-CONTRACT-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-CONTRACT-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX10-CONTRACT-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX10-CONTRACT-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; 
GFX10-CONTRACT-NEXT: v_lshl_or_b32 v4, v8, 16, v4 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, 0xffff, v3, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, 0xffff, v5, s4 -; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v2, v4 -; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 +; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-CONTRACT-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-CONTRACT-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs: @@ -957,29 +920,23 @@ ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-DENORM-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-DENORM-NEXT: v_and_or_b32 v3, 0xffff, v3, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 +; GFX10-DENORM-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-DENORM-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX10-DENORM-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 +; GFX10-DENORM-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v4, v2 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 +; GFX10-DENORM-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-DENORM-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v5, s4 -; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v2, v1 -; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 +; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-DENORM-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: @@ -989,22 +946,17 @@ ; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, 0xffff, v4, v8 +; GFX10-UNSAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-UNSAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-UNSAFE-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, 
v3, v5 +; GFX10-UNSAFE-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX10-UNSAFE-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; GFX10-UNSAFE-NEXT: v_lshl_or_b32 v4, v8, 16, v4 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, 0xffff, v3, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, 0xffff, v5, s4 -; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v2, v4 -; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 +; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-UNSAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-UNSAFE-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <3 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll @@ -450,12 +450,11 @@ ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -471,12 +470,11 @@ ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_mul: @@ -486,11 +484,13 @@ ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 +; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; 
GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -508,11 +508,13 @@ ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-DENORM-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x half> %x, %y @@ -527,12 +529,11 @@ ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: @@ -548,12 +549,11 @@ ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_mul_rhs: @@ -563,11 +563,13 @@ ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 ; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 +; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: @@ -585,11 +587,13 @@ ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-DENORM-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll @@ -238,12 +238,11 @@ ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul: @@ -259,12 +258,11 @@ ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, 
v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX9-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_v4f16_sub_ext_neg_mul: @@ -274,11 +272,13 @@ ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 +; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul: @@ -296,11 +296,13 @@ ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5 -; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-DENORM-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-DENORM-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] entry: %a = fmul <4 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -239,6 +239,8 @@ ; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224 ; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256 ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264 @@ -307,13 +309,9 @@ ; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded 
Reload ; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: v_bfe_u32 v0, v6, 1, 6 -; GCN-NEXT: v_lshrrev_b32_e64 v5, 6, s33 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0x100, v5 -; GCN-NEXT: v_add_u32_e32 v0, v5, v0 -; GCN-NEXT: v_and_b32_e32 v1, 1, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x7f, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GCN-NEXT: v_add_u32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v16, v20 ; GCN-NEXT: v_mov_b32_e32 v17, v21 @@ -331,7 +329,7 @@ ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508 -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -349,8 +347,6 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(16) -; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <128 x i16>, <128 x i16> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -6,42 +6,110 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(<4 x i16> addrspace(4)* inreg %ptr, i32 inreg %idx) { -; GCN-LABEL: extractelement_sgpr_v4i16_sgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_lshr_b32 s2, s4, 1 -; GCN-NEXT: s_cmp_eq_u32 s2, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_and_b32 s1, s4, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 4 -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i16_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s5 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_and_b32 s0, s4, 3 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: buffer_load_ushort v0, v0, s[8:11], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i16_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s5 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_and_b32 s0, s4, 3 +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 +; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: buffer_load_ushort v0, v0, s[8:11], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i16_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s5 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_and_b32 s0, s4, 3 +; GFX7-NEXT: v_mov_b32_e32 v0, 8 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: buffer_load_ushort v0, v0, s[8:11], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i16_sgpr_idx: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_lshr_b32 s2, s4, 1 -; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s5 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_and_b32 s2, s4, 3 +; GFX10-NEXT: s_lshl_b32 s2, s2, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, 8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cselect_b32 s0, s1, s0 -; GFX10-NEXT: s_and_b32 s1, s4, 1 -; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8 +; GFX10-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:12 +; GFX10-NEXT: buffer_load_ushort v0, v2, s[8:11], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i16_sgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_lshr_b32 s2, s4, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: s_and_b32 s2, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s2, s2, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cselect_b32 s0, s1, s0 -; GFX11-NEXT: s_and_b32 s1, s4, 1 -; GFX11-NEXT: s_lshl_b32 s1, s1, 4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; 
GFX11-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX11-NEXT: scratch_load_u16 v0, v2, off offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx @@ -52,68 +120,100 @@ ; GFX9-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_lshr_b32 s0, s2, 1 -; GFX9-NEXT: s_and_b32 s1, s2, 1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX9-NEXT: s_lshl_b32 s0, s1, 4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s2, 3 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 +; GFX9-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_lshl_b32 s0, s1, 4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s3 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: s_and_b32 s0, s2, 3 +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX8-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_lshr_b32 s0, s2, 1 -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_lshl_b32 s0, s1, 4 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s3 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: v_mov_b32_e32 v2, 8 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX7-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; 
; GFX10-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s0, s2, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 s0, s2, 3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v2, s0, 8 +; GFX10-NEXT: buffer_load_ushort v0, v2, s[4:7], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_lshr_b32 s0, s2, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX11-NEXT: s_and_b32 s0, s2, 1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_and_b32 s0, s2, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX11-NEXT: scratch_load_u16 v0, v2, off offset:8 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx @@ -125,39 +225,45 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX9-NEXT: v_add_u32_e32 v0, v1, v2 +; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i16_vgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; 
GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v2 +; GFX8-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX7-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_lshr_b32_e64 v1, s32, 6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v2 +; GFX7-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v4i16_vgpr_idx: @@ -165,13 +271,16 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX10-NEXT: v_lshrrev_b32_e64 v3, 5, s32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-NEXT: buffer_load_ushort v0, v2, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v4i16_vgpr_idx: @@ -179,14 +288,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 4, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 +; GFX11-NEXT: scratch_load_u16 v0, v2, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx @@ -194,49 +303,109 @@ } define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(<4 x i16> addrspace(4)* inreg %ptr, i32 %idx) { -; GCN-LABEL: extractelement_sgpr_v4i16_vgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; 
GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v4i16_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:8 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:12 +; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i16_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:8 +; GFX8-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:12 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX8-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i16_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s8, s4 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s8 +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:12 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v0 +; GFX7-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v4i16_vgpr_idx: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: 
s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX10-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:12 +; GFX10-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v4i16_vgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-NEXT: v_dual_cndmask_b32 v1, s0, v2 :: v_dual_and_b32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX11-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX11-NEXT: scratch_load_u16 v0, v2, off offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx @@ -528,54 +697,127 @@ } define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(<8 x i16> addrspace(4)* inreg %ptr, i32 inreg %idx) { -; GCN-LABEL: extractelement_sgpr_v8i16_sgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_lshr_b32 s5, s4, 1 -; GCN-NEXT: s_cmp_eq_u32 s5, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cselect_b32 s0, s1, s0 -; GCN-NEXT: s_cmp_eq_u32 s5, 2 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_cmp_eq_u32 s5, 3 -; GCN-NEXT: s_cselect_b32 s0, s3, s0 -; GCN-NEXT: s_and_b32 s1, s4, 1 -; GCN-NEXT: s_lshl_b32 s1, s1, 4 -; GCN-NEXT: s_lshr_b32 s0, s0, s1 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s5 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:16 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_and_b32 s0, s4, 7 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: buffer_load_ushort v0, v0, 
s[8:11], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s5 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:16 +; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:20 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_and_b32 s0, s4, 7 +; GFX8-NEXT: v_mov_b32_e32 v0, 16 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:24 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: buffer_load_ushort v0, v0, s[8:11], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s5 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:20 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_and_b32 s0, s4, 7 +; GFX7-NEXT: v_mov_b32_e32 v0, 16 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:24 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: buffer_load_ushort v0, v0, s[8:11], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_sgpr_idx: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_lshr_b32 s5, s4, 1 -; GFX10-NEXT: s_cmp_eq_u32 s5, 1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s5 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_and_b32 s4, s4, 7 +; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v4, s4, 16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cselect_b32 s0, s1, s0 -; GFX10-NEXT: s_cmp_eq_u32 s5, 2 -; GFX10-NEXT: s_cselect_b32 s0, s2, s0 -; GFX10-NEXT: s_cmp_eq_u32 s5, 3 -; GFX10-NEXT: s_cselect_b32 s0, s3, s0 -; GFX10-NEXT: s_and_b32 s1, s4, 1 -; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 +; GFX10-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 +; GFX10-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 +; GFX10-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:28 +; GFX10-NEXT: 
buffer_load_ushort v0, v4, s[8:11], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_sgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_lshr_b32 s5, s4, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_eq_u32 s5, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cselect_b32 s0, s1, s0 -; GFX11-NEXT: s_cmp_eq_u32 s5, 2 -; GFX11-NEXT: s_cselect_b32 s0, s2, s0 -; GFX11-NEXT: s_cmp_eq_u32 s5, 3 -; GFX11-NEXT: s_cselect_b32 s0, s3, s0 -; GFX11-NEXT: s_and_b32 s1, s4, 1 +; GFX11-NEXT: s_and_b32 s4, s4, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s1, s1, 4 -; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s4, s4, 1 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX11-NEXT: scratch_load_u16 v0, v4, off offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx @@ -586,34 +828,46 @@ ; GFX9-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_lshr_b32 s0, s2, 1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s2, 7 +; GFX9-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:20 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 +; GFX9-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:28 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 +; GFX9-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: s_lshl_b32 s0, s1, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s3 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: s_and_b32 s0, s2, 7 +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 +; GFX8-NEXT: 
buffer_store_dword v1, off, s[4:7], 0 offset:20 +; GFX8-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 +; GFX8-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:28 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; GFX8-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: s_lshl_b32 s0, s1, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -623,55 +877,62 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_lshr_b32 s0, s2, 1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s3 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_and_b32 s0, s2, 7 +; GFX7-NEXT: v_mov_b32_e32 v4, 16 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:28 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v4 +; GFX7-NEXT: buffer_load_ushort v0, v0, s[8:11], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7-NEXT: s_lshl_b32 s0, s1, 4 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_vgpr_v8i16_sgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s0, s2, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 s0, s2, 7 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:20 +; GFX10-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:28 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v4, s0, 16 +; GFX10-NEXT: buffer_load_ushort v0, v4, s[4:7], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 -; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: 
extractelement_vgpr_v8i16_sgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: s_lshr_b32 s0, s2, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX11-NEXT: s_and_b32 s0, s2, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX11-NEXT: scratch_load_u16 v0, v4, off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 -; GFX11-NEXT: s_and_b32 s0, s2, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx @@ -683,34 +944,34 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 7, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i16_vgpr_idx: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 7, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s32 +; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 +; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8 +; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12 +; GFX8-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX8-NEXT: 
v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i16_vgpr_idx: @@ -720,17 +981,17 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_lshr_b32_e64 v1, s32, 6 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s32 +; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 +; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8 +; GFX7-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12 +; GFX7-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: extractelement_vgpr_v8i16_vgpr_idx: @@ -738,17 +999,18 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 7, v2 +; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s32 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: extractelement_vgpr_v8i16_vgpr_idx: @@ -756,19 +1018,14 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 7, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b128 off, v[3:6], s32 +; GFX11-NEXT: scratch_load_u16 v0, v0, s32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX11-NEXT: 
v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx @@ -776,65 +1033,126 @@ } define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(<8 x i16> addrspace(4)* inreg %ptr, i32 %idx) { -; GCN-LABEL: extractelement_sgpr_v8i16_vgpr_idx: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 -; GCN-NEXT: v_mov_b32_e32 v5, s3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX9-LABEL: extractelement_sgpr_v8i16_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:28 +; GFX9-NEXT: v_add_u32_e32 v0, 16, v0 +; GFX9-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v8i16_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:20 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX8-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:24 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:28 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to 
shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v8i16_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s8, s4 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s8 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:16 +; GFX7-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:20 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX7-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:24 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:28 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GFX7-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: extractelement_sgpr_v8i16_vgpr_idx: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 7, v0 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 +; GFX10-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:20 +; GFX10-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:28 +; GFX10-NEXT: buffer_load_ushort v0, v0, s[4:7], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: extractelement_sgpr_v8i16_vgpr_idx: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 7, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 
v2, s3, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 1, v4 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX11-NEXT: scratch_load_u16 v0, v4, off offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -678,12 +678,12 @@ ; GFX9-LABEL: v_fdiv_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5 ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 @@ -691,10 +691,9 @@ ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16: @@ -702,21 +701,21 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5 +; GFX10-NEXT: v_rcp_f32_e32 v4, v4 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16: @@ -724,21 +723,21 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v0 -; 
GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX11-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v4, v7, v4 :: v_dual_mul_f32 v3, v6, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_dual_mul_f32 v3, v6, v3 :: v_dual_mul_f32 v4, v7, v4 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v4, v1, v0 -; GFX11-NEXT: v_div_fixup_f16 v2, v3, v2, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> %a, %b ret <2 x half> %fdiv @@ -778,9 +777,8 @@ ; GFX9-NEXT: v_rcp_f16_e32 v2, v1 ; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_afn: @@ -790,8 +788,9 @@ ; GFX10-NEXT: v_rcp_f16_e32 v2, v1 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_afn: @@ -804,9 +803,9 @@ ; GFX11-NEXT: v_rcp_f16_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_f16_e32 v2, v3, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x half> %a, %b ret <2 x half> %fdiv @@ -909,12 +908,12 @@ ; GFX9-LABEL: v_fdiv_v2f16_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5 ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 @@ -922,10 +921,9 @@ ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; 
GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_ulp25: @@ -933,21 +931,21 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5 +; GFX10-NEXT: v_rcp_f32_e32 v4, v4 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_ulp25: @@ -955,21 +953,21 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX11-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v4, v7, v4 :: v_dual_mul_f32 v3, v6, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_dual_mul_f32 v3, v6, v3 :: v_dual_mul_f32 v4, v7, v4 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v4, v1, v0 -; GFX11-NEXT: v_div_fixup_f16 v2, v3, v2, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv @@ -1069,21 +1067,20 @@ ; GFX9-LABEL: v_rcp_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 
; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16: @@ -1091,40 +1088,39 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 -; GFX11-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x ret <2 x half> %fdiv @@ -1224,21 +1220,20 @@ ; GFX9-LABEL: v_rcp_v2f16_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_arcp: @@ 
-1246,40 +1241,39 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f16_arcp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 -; GFX11-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> , %x ret <2 x half> %fdiv @@ -1314,9 +1308,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v1, v0 -; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_arcp_afn: @@ -1324,20 +1317,21 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_rcp_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f16_arcp_afn: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_rcp_f16_e32 v1, 
v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_rcp_f16_e32 v0, v0 -; GFX11-NEXT: v_rcp_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn <2 x half> , %x ret <2 x half> %fdiv @@ -1428,9 +1422,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rcp_f16_e32 v1, v0 -; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_ulp25: @@ -1438,20 +1431,21 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_rcp_f16_e32 v1, v0 -; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_rcp_v2f16_ulp25: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_rcp_f16_e32 v1, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_rcp_f16_e32 v0, v0 -; GFX11-NEXT: v_rcp_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x, !fpmath !0 ret <2 x half> %fdiv @@ -1491,9 +1485,8 @@ ; GFX9-NEXT: v_rcp_f16_e32 v2, v1 ; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25: @@ -1503,8 +1496,9 @@ ; GFX10-NEXT: v_rcp_f16_e32 v2, v1 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25: @@ -1517,9 +1511,9 @@ ; GFX11-NEXT: v_rcp_f16_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_f16_e32 v2, v3, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 
+; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv @@ -1622,12 +1616,12 @@ ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5 ; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 @@ -1635,10 +1629,9 @@ ; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25: @@ -1646,21 +1639,21 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5 +; GFX10-NEXT: v_rcp_f32_e32 v4, v4 ; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25: @@ -1668,21 +1661,21 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX11-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v4, v7, v4 :: v_dual_mul_f32 v3, v6, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_dual_mul_f32 v3, v6, v3 :: v_dual_mul_f32 v4, v7, v4 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v4, v1, v0 -; GFX11-NEXT: v_div_fixup_f16 v2, v3, v2, v5 
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv @@ -1722,9 +1715,8 @@ ; GFX9-NEXT: v_rcp_f16_e32 v2, v1 ; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: @@ -1734,8 +1726,9 @@ ; GFX10-NEXT: v_rcp_f16_e32 v2, v1 ; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: @@ -1748,9 +1741,9 @@ ; GFX11-NEXT: v_rcp_f16_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_f16_e32 v2, v3, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -230,10 +230,9 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_exp_f16_e32 v1, v1 -; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_exp_f16_e32 v0, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16: @@ -248,40 +247,39 @@ ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 -; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 +; GFX10-NEXT: v_exp_f16_e32 v0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v2, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y) ret <2 x half> %pow @@ -345,10 +343,9 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_exp_f16_e32 v1, v1 -; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_exp_f16_e32 v0, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_lhs: @@ -364,11 +361,12 @@ ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 -; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 +; GFX10-NEXT: v_exp_f16_e32 v0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_lhs: @@ -379,26 +377,25 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: 
v_log_f16_e32 v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v2, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y) @@ -464,9 +461,8 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_exp_f16_e32 v0, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_rhs: @@ -474,49 +470,48 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_log_f16_e32 v2, v0 -; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 -; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 +; GFX10-NEXT: v_exp_f16_e32 v0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_rhs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v2, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg) @@ -589,9 +584,8 @@ ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: v_exp_f16_e32 v1, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_exp_f16_e32 v0, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs: @@ -608,11 +602,12 @@ ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_exp_f16_e32 v1, v1 -; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX10-NEXT: v_exp_f16_e32 v1, v2 +; GFX10-NEXT: v_exp_f16_e32 v0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs: @@ -622,29 +617,27 @@ ; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_log_f16_e32 v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 
0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v2, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -8,69 +8,98 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_s_v2i16_s_s: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_and_b32 s1, s5, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s2, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 -; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s6 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_andn2_b32 s0, s0, s1 -; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_and_b32 s0, s5, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v2i16_s_s: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: s_and_b32 s1, s5, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s2, s4, 0xffff -; GFX8-NEXT: s_lshl_b32 s2, s2, s1 -; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s6 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: 
s_and_b32 s0, s5, 1 +; GFX8-NEXT: v_mov_b32_e32 v0, 4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v2i16_s_s: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_and_b32 s1, s5, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s2, s4, 0xffff -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 -; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s6 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_and_b32 s0, s5, 1 +; GFX7-NEXT: v_mov_b32_e32 v0, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v2i16_s_s: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s6 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: s_and_b32 s1, s5, 1 -; GFX10-NEXT: s_and_b32 s2, s4, 0xffff -; GFX10-NEXT: s_lshl_b32 s1, s1, 4 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_lshl_b32 s1, s1, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v2, s1, 4 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX10-NEXT: buffer_store_short v0, v2, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s0, s0, s3 -; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -78,16 +107,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_and_b32 s1, s5, 1 -; GFX11-NEXT: s_and_b32 s2, s4, 0xffff -; GFX11-NEXT: s_lshl_b32 s1, s1, 4 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: s_lshl_b32 s1, s1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4 +; GFX11-NEXT: scratch_store_b16 v2, v0, off offset:4 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -100,84 +129,117 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v2i16_s_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v2, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 ; GFX9-NEXT: s_and_b32 s0, s3, 1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_and_b32 s1, s2, 0xffff -; GFX9-NEXT: s_lshl_b32 s1, s1, s0 -; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v2, s0, v3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 ; GFX8-NEXT: s_and_b32 s0, s3, 1 -; GFX8-NEXT: s_and_b32 s1, s2, 0xffff -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshl_b32 s1, s1, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, 4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX8-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, s1, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v2i16_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_mov_b32 s8, s4 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s8 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 ; GFX7-NEXT: s_and_b32 s0, s3, 1 -; GFX7-NEXT: s_and_b32 s1, s2, 0xffff -; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: s_lshl_b32 s1, s1, s0 -; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 
s0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX7-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, s1, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_s_s: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v2, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 ; GFX10-NEXT: s_and_b32 s0, s3, 1 -; GFX10-NEXT: s_and_b32 s1, s2, 0xffff -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v2, s0, 4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_short v1, v2, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s0 -; GFX10-NEXT: s_lshl_b32 s0, s1, s0 -; GFX10-NEXT: s_not_b32 s1, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v2, s1, s0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v2i16_s_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v2, v[0:1], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_and_b32 s0, s3, 1 -; GFX11-NEXT: s_and_b32 s1, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s0 -; GFX11-NEXT: s_lshl_b32 s0, s1, s0 -; GFX11-NEXT: s_not_b32 s1, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_or_b32 v2, v2, s1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -190,64 +252,94 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_s_v2i16_v_s: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_and_b32 s1, s4, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s5 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_andn2_b32 s0, s0, s2 -; GFX9-NEXT: 
v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_and_b32 s0, s4, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 +; GFX9-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, s1, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v2i16_v_s: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: s_and_b32 s1, s4, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_andn2_b32 s0, s0, s1 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s5 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_and_b32 s0, s4, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, 4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 +; GFX8-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, s0, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v2i16_v_s: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_and_b32 s1, s4, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_andn2_b32 s0, s0, s1 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s5 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_and_b32 s0, s4, 1 +; GFX7-NEXT: v_mov_b32_e32 v1, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 +; GFX7-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, s0, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v2i16_v_s: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s5 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: s_and_b32 s1, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX10-NEXT: s_lshl_b32 s1, s1, 4 +; GFX10-NEXT: s_lshl_b32 s1, s1, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v2, s1, 4 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX10-NEXT: buffer_store_short 
v0, v2, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s0, s0, s2 -; GFX10-NEXT: v_lshl_or_b32 v2, v2, s1, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; @@ -255,15 +347,16 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX11-NEXT: s_and_b32 s1, s4, 1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX11-NEXT: s_lshl_b32 s1, s1, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s1, s1, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4 +; GFX11-NEXT: scratch_store_b16 v2, v0, off offset:4 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_not1_b32 s0, s0, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_lshl_or_b32 v2, v2, s1, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -276,87 +369,111 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_s_v2i16_s_v: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s5 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: s_and_b32 s2, s4, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v2i16_s_v: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_mov_b32 s1, 0xffff -; GFX8-NEXT: s_and_b32 s2, s4, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s5 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: buffer_store_dword v1, off, 
s[8:11], 0 offset:4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v2i16_s_v: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s1, s4, 0xffff -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s5 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v2i16_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_and_b32 s1, s4, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_add_u32 s8, s8, s5 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:4 +; GFX10-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, s0, v3, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v2i16_s_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_and_b32 s1, s4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: scratch_store_b32 off, v2, off offset:4 +; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:4 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_or_b32 v2, s0, v3, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -369,84 +486,110 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_s_v2i16_v_v: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v1, s1 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v2i16_v_v: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: s_mov_b32 s1, 0xffff -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v1, s1 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 4, v1 +; GFX8-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:4 +; GFX8-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v2i16_v_v: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s8, s4 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v1 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: s_add_u32 s4, s4, s8 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1 +; GFX7-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:4 +; GFX7-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v2i16_v_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, s0, v2, v3 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v2i16_v_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_and_b32 v1, 1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, s0, v2, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX11-NEXT: scratch_store_b32 off, v2, off offset:4 +; GFX11-NEXT: scratch_store_b16 v1, v0, off offset:4 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -459,87 +602,109 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v2i16_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: 
s_and_b32 s1, s2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 -; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s3 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s2, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 4, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX8-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v2i16_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s3 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_and_b32 s0, s2, 0xffff -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX7-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT: s_and_b32 s0, s2, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: 
v_lshlrev_b32_e64 v1, v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v2i16_s_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX11-NEXT: s_and_b32 s0, s2, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_xor_b32_e32 v4, -1, v1 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX11-NEXT: scratch_store_b16 v1, v2, off offset:4 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -552,83 +717,109 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v2i16_v_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 ; GFX9-NEXT: s_and_b32 s0, s2, 1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s3 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 ; GFX8-NEXT: s_and_b32 s0, s2, 1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v1, 4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX8-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v2i16_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s3 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 ; GFX7-NEXT: s_and_b32 s0, s2, 1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s0, v1 -; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX7-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_v_s: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 ; GFX10-NEXT: s_and_b32 s0, s2, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v1, s0, 4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v2i16_v_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-NEXT: global_load_b32 v0, v[0:1], 
off ; GFX11-NEXT: s_and_b32 s0, s2, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, s0, v0 -; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX11-NEXT: scratch_store_b16 v1, v2, off offset:4 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_not_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -641,84 +832,105 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v2i16_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s2 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i16_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 4, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX8-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v2i16_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] +; 
GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s2 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX7-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_v_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v2i16_v_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX11-NEXT: scratch_store_b16 v1, v2, off offset:4 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -789,114 +1001,126 @@ ; GFX9-LABEL: insertelement_v_v4i16_s_s: ; GFX9: 
; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_and_b32 s1, s3, 1 -; GFX9-NEXT: s_lshr_b32 s0, s3, 1 -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 -; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX9-NEXT: s_not_b32 s1, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s3, 3 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_add_u32_e32 v2, s0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX9-NEXT: buffer_store_short v3, v2, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v4, v5, s1, v4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_lshr_b32 s0, s3, 1 -; GFX8-NEXT: s_and_b32 s2, s2, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_lshl_b32 s2, s2, s1 -; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: s_not_b32 s1, s1 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: s_and_b32 s0, s3, 3 +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX8-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX8-NEXT: buffer_store_short v3, v2, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 -; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v4i16_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_and_b32 s1, s3, 1 -; GFX7-NEXT: s_lshr_b32 s0, s3, 1 -; GFX7-NEXT: s_and_b32 s2, s2, 0xffff -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 -; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 
-; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_mov_b32 s8, s4 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s8 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: s_and_b32 s0, s3, 3 +; GFX7-NEXT: v_mov_b32_e32 v2, 8 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX7-NEXT: buffer_store_short v3, v2, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v4, s1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v4i16_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s0, s3, 1 -; GFX10-NEXT: s_and_b32 s1, s3, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_and_b32 s2, s2, 0xffff -; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, s1 -; GFX10-NEXT: s_not_b32 s2, s3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 s0, s3, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v3, s0, 8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v2, s2, s1 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX10-NEXT: buffer_store_short v2, v3, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v4i16_s_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_lshr_b32 s0, s3, 1 -; GFX11-NEXT: s_and_b32 s1, s3, 1 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX11-NEXT: s_lshl_b32 s1, s1, 4 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, s1 -; GFX11-NEXT: s_not_b32 s2, s3 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 +; GFX11-NEXT: s_and_b32 s0, s3, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0 ; GFX11-NEXT: s_waitcnt 
vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v4, v2, s2, s1 +; GFX11-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX11-NEXT: scratch_store_b16 v3, v2, off offset:8 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -909,125 +1133,126 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_s_v4i16_v_s: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s2, s4, 1 -; GFX9-NEXT: s_cmp_eq_u32 s2, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s5 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cselect_b32 s3, s1, s0 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: s_lshl_b32 s4, s4, 4 -; GFX9-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX9-NEXT: s_andn2_b32 s3, s3, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_and_b32 s0, s4, 3 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 +; GFX9-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v4i16_v_s: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_lshr_b32 s2, s4, 1 -; GFX8-NEXT: s_cmp_eq_u32 s2, 1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cselect_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s4, s4, 1 -; GFX8-NEXT: s_lshl_b32 s4, s4, 4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_andn2_b32 s3, s3, s4 -; GFX8-NEXT: v_or_b32_e32 v4, s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s5 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_and_b32 s0, s4, 3 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 +; GFX8-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 +; GFX8-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:12 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v4i16_v_s: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_lshr_b32 s2, s4, 1 -; GFX7-NEXT: s_cmp_eq_u32 s2, 1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_cselect_b32 s3, s1, s0 -; GFX7-NEXT: s_and_b32 s4, s4, 1 -; GFX7-NEXT: s_lshl_b32 s4, s4, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 -; GFX7-NEXT: s_andn2_b32 s3, s3, s4 -; GFX7-NEXT: v_or_b32_e32 v4, s3, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s5 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_and_b32 s0, s4, 3 +; GFX7-NEXT: v_mov_b32_e32 v1, 8 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 +; GFX7-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v4i16_v_s: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_lshr_b32 s2, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX10-NEXT: s_cmp_eq_u32 s2, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s5 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_and_b32 s2, s4, 3 +; GFX10-NEXT: s_lshl_b32 s2, s2, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v3, s2, 8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cselect_b32 s3, s1, s0 -; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_lshl_b32 s4, s4, 4 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX10-NEXT: s_andn2_b32 s3, s3, s5 -; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 +; GFX10-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 +; GFX10-NEXT: 
buffer_store_short v0, v3, s[8:11], 0 offen +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 +; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:12 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v4i16_v_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: s_lshr_b32 s2, s4, 1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX11-NEXT: s_cmp_eq_u32 s2, 1 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 +; GFX11-NEXT: s_and_b32 s2, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s2, s2, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cselect_b32 s3, s1, s0 -; GFX11-NEXT: s_and_b32 s4, s4, 1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: s_lshl_b32 s4, s4, 4 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s3, s3, s5 -; GFX11-NEXT: v_lshl_or_b32 v4, v2, s4, s3 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: scratch_store_b64 off, v[1:2], off offset:8 +; GFX11-NEXT: scratch_store_b16 v3, v0, off offset:8 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, v0, v4 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1040,134 +1265,125 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_s_v4i16_s_v: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s5 +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_and_b32 s3, s4, 0xffff -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_or_b32 v4, v1, v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 +; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_short v1, v0, 
s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v4i16_s_v: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s5 +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: s_and_b32 s3, s4, 0xffff -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_or_b32_e32 v4, v0, v3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 +; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 +; GFX8-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:12 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v4i16_s_v: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s5 +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: s_and_b32 s2, s4, 0xffff -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v4, v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:8 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:12 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; 
GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v4i16_s_v: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0 -; GFX10-NEXT: s_and_b32 s2, s4, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s5 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:8 +; GFX10-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:12 +; GFX10-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:8 +; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:12 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v4i16_s_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 1, v0 -; GFX11-NEXT: s_and_b32 s2, s4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_and_b32 v2, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_lshlrev_b32 v1, 4, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v5, v5, v2, v3 +; GFX11-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX11-NEXT: scratch_store_b16 v2, v3, off offset:8 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: 
v_cndmask_b32_e64 v0, v0, v5, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1180,132 +1396,125 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_s_v4i16_v_v: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_u32_e32 v1, 8, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; GFX9-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:12 +; GFX9-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v4i16_v_v: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 8, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; GFX8-NEXT: 
buffer_store_dword v3, off, s[4:7], 0 offset:12 +; GFX8-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v4i16_v_v: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s8, s4 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, s0 -; GFX7-NEXT: v_mov_b32_e32 v4, s1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: s_add_u32 s4, s4, s8 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:12 +; GFX7-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v4i16_v_v: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 -; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 8, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_store_dword v3, off, s[4:7], 
0 offset:12 +; GFX10-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v4i16_v_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 1, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, 4, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_and_b32 v3, 3, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX11-NEXT: scratch_store_b64 off, v[1:2], off offset:8 +; GFX11-NEXT: scratch_store_b16 v3, v0, off offset:8 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1319,117 +1528,118 @@ ; GFX9-LABEL: insertelement_v_v4i16_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_add_u32_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX9-NEXT: buffer_store_short v3, v2, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, 
v5 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s2, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s3 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX8-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX8-NEXT: buffer_store_short v3, v2, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v4i16_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 1, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: s_and_b32 s0, s2, 0xffff -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX7-NEXT: v_lshl_b32_e32 v6, s0, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s3 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX7-NEXT: buffer_store_short v3, v2, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; GFX7-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 -; 
GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v4i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v3, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v2 -; GFX10-NEXT: s_and_b32 s0, s2, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX10-NEXT: buffer_store_short v3, v2, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v4i16_s_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 1, v2 -; GFX11-NEXT: s_and_b32 s0, s2, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX11-NEXT: v_and_b32_e32 v3, 1, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v3, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v4 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_and_b32 v2, 3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX11-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX11-NEXT: scratch_store_b16 v2, v3, off offset:8 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1443,112 +1653,118 @@ ; GFX9-LABEL: insertelement_v_v4i16_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_and_b32 s1, 
s2, 1 -; GFX9-NEXT: s_lshr_b32 s0, s2, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX9-NEXT: buffer_store_short v2, v3, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v5, s1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_not_b32 s1, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s3 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: s_and_b32 s0, s2, 3 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX8-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX8-NEXT: buffer_store_short v2, v3, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v4i16_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_lshr_b32 s0, s2, 1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 -; 
GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s3 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: v_mov_b32_e32 v3, 8 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, s0, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX7-NEXT: buffer_store_short v2, v3, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v5, s1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v4i16_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s1, s2, 1 -; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 s0, s2, 3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v3, s0, 8 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v3, s0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX10-NEXT: buffer_store_short v2, v3, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v4i16_v_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_lshr_b32 s1, s2, 1 -; GFX11-NEXT: s_and_b32 s0, s2, 1 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_and_b32 s0, s2, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: v_mov_b32_e32 v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; 
GFX11-NEXT: v_dual_cndmask_b32 v3, v0, v1 :: v_dual_lshlrev_b32 v2, s0, v2 -; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_not_b32 s0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_and_or_b32 v4, v3, s0, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX11-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX11-NEXT: scratch_store_b16 v3, v2, off offset:8 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1562,115 +1778,114 @@ ; GFX9-LABEL: insertelement_v_v4i16_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s2 +; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX9-NEXT: buffer_store_short v2, v3, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i16_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s2 +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 
1, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX8-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX8-NEXT: buffer_store_short v2, v3, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v4i16_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 -; GFX7-NEXT: v_mov_b32_e32 v5, 0 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s2 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 8, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX7-NEXT: buffer_store_short v2, v3, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v4i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v4, 1, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 8, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX10-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:12 +; GFX10-NEXT: buffer_store_short v2, v3, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:12 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v4i16_v_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_and_b32_e32 v4, 1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX11-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX11-NEXT: scratch_store_b16 v3, v2, off offset:8 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1683,177 +1898,156 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_s_v8i16_s_s: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s6, s5, 1 -; GFX9-NEXT: s_cmp_eq_u32 s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s6 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cselect_b32 s7, s1, s0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 2 -; GFX9-NEXT: s_cselect_b32 s7, s2, s7 -; GFX9-NEXT: s_cmp_eq_u32 s6, 3 -; GFX9-NEXT: s_cselect_b32 s7, s3, s7 -; GFX9-NEXT: s_and_b32 s5, s5, 1 -; GFX9-NEXT: s_lshl_b32 s5, s5, 4 -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s4, s4, s5 -; GFX9-NEXT: s_lshl_b32 s5, 0xffff, s5 -; GFX9-NEXT: s_andn2_b32 s5, s7, s5 -; GFX9-NEXT: s_or_b32 s4, s5, s4 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 -; GFX9-NEXT: s_cselect_b32 s0, s4, s0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 1 -; GFX9-NEXT: s_cselect_b32 s1, s4, s1 -; GFX9-NEXT: s_cmp_eq_u32 s6, 2 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: s_cmp_eq_u32 s6, 3 -; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:16 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_and_b32 s0, s5, 7 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX9-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v8i16_s_s: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_lshr_b32 s6, s5, 1 -; GFX8-NEXT: s_cmp_eq_u32 s6, 1 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cselect_b32 s7, s1, s0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 2 -; GFX8-NEXT: s_cselect_b32 s7, s2, s7 -; GFX8-NEXT: s_cmp_eq_u32 s6, 3 -; GFX8-NEXT: s_cselect_b32 s7, s3, s7 -; GFX8-NEXT: s_and_b32 s5, s5, 1 -; GFX8-NEXT: s_lshl_b32 s5, s5, 4 -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff -; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_lshl_b32 s5, 0xffff, s5 -; GFX8-NEXT: s_andn2_b32 s5, s7, s5 -; GFX8-NEXT: s_or_b32 s4, s5, s4 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 -; GFX8-NEXT: s_cselect_b32 s0, s4, s0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 1 -; GFX8-NEXT: s_cselect_b32 s1, s4, s1 -; GFX8-NEXT: s_cmp_eq_u32 s6, 2 -; GFX8-NEXT: s_cselect_b32 s2, s4, s2 -; GFX8-NEXT: s_cmp_eq_u32 s6, 3 -; GFX8-NEXT: s_cselect_b32 s3, s4, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s6 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:16 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_and_b32 s0, s5, 7 +; GFX8-NEXT: v_mov_b32_e32 v0, 16 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:24 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX8-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX8-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX8-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v8i16_s_s: ; GFX7: ; %bb.0: +; GFX7-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_lshr_b32 s6, s5, 1 -; GFX7-NEXT: s_cmp_eq_u32 s6, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_cselect_b32 s7, s1, s0 -; GFX7-NEXT: s_cmp_eq_u32 s6, 2 -; GFX7-NEXT: s_cselect_b32 s7, s2, s7 -; GFX7-NEXT: s_cmp_eq_u32 s6, 3 -; GFX7-NEXT: s_cselect_b32 s7, s3, s7 -; GFX7-NEXT: s_and_b32 s5, s5, 1 -; GFX7-NEXT: s_lshl_b32 s5, s5, 4 -; GFX7-NEXT: s_and_b32 s4, s4, 0xffff -; GFX7-NEXT: s_lshl_b32 s4, s4, s5 -; GFX7-NEXT: s_lshl_b32 s5, 0xffff, s5 -; GFX7-NEXT: s_andn2_b32 s5, s7, s5 -; GFX7-NEXT: s_or_b32 s4, s5, s4 -; GFX7-NEXT: s_cmp_eq_u32 s6, 0 -; GFX7-NEXT: s_cselect_b32 s0, s4, s0 -; GFX7-NEXT: s_cmp_eq_u32 s6, 1 -; GFX7-NEXT: s_cselect_b32 s1, s4, s1 -; GFX7-NEXT: s_cmp_eq_u32 s6, 2 -; GFX7-NEXT: s_cselect_b32 s2, s4, s2 -; GFX7-NEXT: s_cmp_eq_u32 s6, 3 -; GFX7-NEXT: s_cselect_b32 s3, s4, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s6 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:16 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_and_b32 s0, s5, 7 +; GFX7-NEXT: v_mov_b32_e32 v0, 16 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:24 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX7-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v8i16_s_s: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_lshr_b32 s6, s5, 1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s6 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_and_b32 s5, s5, 7 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_lshl_b32 s4, s5, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v5, s4, 16 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 +; GFX10-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:16 +; GFX10-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:20 +; GFX10-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:24 +; GFX10-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:28 +; GFX10-NEXT: buffer_store_short v0, v5, s[8:11], 0 offen +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX10-NEXT: buffer_load_dword 
v1, off, s[8:11], 0 offset:20 +; GFX10-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX10-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cselect_b32 s7, s1, s0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 2 -; GFX10-NEXT: s_cselect_b32 s7, s2, s7 -; GFX10-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-NEXT: s_cselect_b32 s7, s3, s7 -; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: s_and_b32 s4, s4, 0xffff -; GFX10-NEXT: s_lshl_b32 s5, s5, 4 -; GFX10-NEXT: s_lshl_b32 s8, 0xffff, s5 -; GFX10-NEXT: s_lshl_b32 s4, s4, s5 -; GFX10-NEXT: s_andn2_b32 s5, s7, s8 -; GFX10-NEXT: s_or_b32 s4, s5, s4 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 -; GFX10-NEXT: s_cselect_b32 s0, s4, s0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 -; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_cmp_eq_u32 s6, 2 -; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v8i16_s_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_lshr_b32 s6, s5, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_eq_u32 s6, 1 +; GFX11-NEXT: s_and_b32 s5, s5, 7 +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: s_lshl_b32 s4, s5, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cselect_b32 s7, s1, s0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 2 -; GFX11-NEXT: s_cselect_b32 s7, s2, s7 -; GFX11-NEXT: s_cmp_eq_u32 s6, 3 -; GFX11-NEXT: s_cselect_b32 s7, s3, s7 -; GFX11-NEXT: s_and_b32 s5, s5, 1 -; GFX11-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-NEXT: s_lshl_b32 s5, s5, 4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s8, 0xffff, s5 -; GFX11-NEXT: s_lshl_b32 s4, s4, s5 -; GFX11-NEXT: s_and_not1_b32 s5, s7, s8 -; GFX11-NEXT: s_or_b32 s4, s5, s4 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 -; GFX11-NEXT: s_cselect_b32 s0, s4, s0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 1 -; GFX11-NEXT: s_cselect_b32 s1, s4, s1 -; GFX11-NEXT: s_cmp_eq_u32 s6, 2 -; GFX11-NEXT: s_cselect_b32 s2, s4, s2 -; GFX11-NEXT: s_cmp_eq_u32 s6, 3 -; GFX11-NEXT: s_cselect_b32 s3, s4, s3 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v0, s0 ; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX11-NEXT: scratch_store_b16 v5, v4, off offset:16 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:16 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1867,148 +2061,143 @@ ; GFX9-LABEL: insertelement_v_v8i16_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_and_b32 s0, s3, 1 -; GFX9-NEXT: s_lshr_b32 s4, s3, 1 -; GFX9-NEXT: s_and_b32 s1, s2, 0xffff -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_lshl_b32 s1, s1, s0 -; 
GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: s_not_b32 s5, s0 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s3, 7 +; GFX9-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:20 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 +; GFX9-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:28 +; GFX9-NEXT: buffer_store_short v5, v4, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX9-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[2:3] -; GFX9-NEXT: v_and_or_b32 v6, v7, s5, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s3, 1 -; GFX8-NEXT: s_lshr_b32 s4, s3, 1 -; GFX8-NEXT: s_and_b32 s1, s2, 0xffff -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshl_b32 s5, s1, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s6, s0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: s_and_b32 s0, s3, 7 +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:20 +; GFX8-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 +; GFX8-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:28 +; GFX8-NEXT: buffer_store_short v5, v4, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX8-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX8-NEXT: 
v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v6, s6, v6 -; GFX8-NEXT: v_or_b32_e32 v6, s5, v6 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v8i16_s_s: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s3, 1 -; GFX7-NEXT: s_lshr_b32 s4, s3, 1 -; GFX7-NEXT: s_and_b32 s1, s2, 0xffff -; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: s_lshl_b32 s5, s1, s0 -; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s6, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s4 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_and_b32 s0, s3, 7 +; GFX7-NEXT: v_mov_b32_e32 v4, 16 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_mov_b32_e32 v5, s2 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, s0, v4 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 -; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:20 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:28 +; GFX7-NEXT: buffer_store_short v5, v4, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX7-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v8i16_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s4, s3, 1 -; GFX10-NEXT: s_and_b32 s1, s3, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 -; GFX10-NEXT: s_lshl_b32 s3, s1, 4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 -; GFX10-NEXT: s_and_b32 s2, s2, 0xffff -; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s3 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_not_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 s0, s3, 7 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v5, s0, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 -; GFX10-NEXT: v_and_or_b32 v6, v4, s3, s2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:20 +; GFX10-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:28 +; GFX10-NEXT: buffer_store_short v4, v5, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v8i16_s_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: s_lshr_b32 s4, s3, 1 -; GFX11-NEXT: s_and_b32 s1, s3, 1 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 -; GFX11-NEXT: s_lshl_b32 s3, s1, 4 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s3 -; GFX11-NEXT: s_lshl_b32 s2, s2, s3 -; GFX11-NEXT: s_not_b32 s3, s5 +; GFX11-NEXT: s_and_b32 s0, s3, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_or_b32 v6, v4, s3, s2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX11-NEXT: scratch_store_b16 v5, v4, off offset:16 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:16 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2021,176 +2210,152 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_s_v8i16_v_s: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 
s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s5, s4, 1 -; GFX9-NEXT: s_cmp_eq_u32 s5, 1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s5 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cselect_b32 s6, s1, s0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 2 -; GFX9-NEXT: s_cselect_b32 s6, s2, s6 -; GFX9-NEXT: s_cmp_eq_u32 s5, 3 -; GFX9-NEXT: s_cselect_b32 s6, s3, s6 -; GFX9-NEXT: s_and_b32 s4, s4, 1 -; GFX9-NEXT: s_lshl_b32 s4, s4, 4 -; GFX9-NEXT: s_lshl_b32 s7, 0xffff, s4 -; GFX9-NEXT: s_andn2_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v6, v0, s4, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_and_b32 s0, s4, 7 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:20 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:28 +; GFX9-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX9-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v8i16_v_s: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_lshr_b32 s5, s4, 1 -; GFX8-NEXT: s_cmp_eq_u32 s5, 1 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_cselect_b32 s6, s1, s0 -; GFX8-NEXT: s_cmp_eq_u32 s5, 2 -; GFX8-NEXT: s_cselect_b32 s6, s2, s6 -; GFX8-NEXT: s_cmp_eq_u32 s5, 3 -; GFX8-NEXT: s_cselect_b32 s6, s3, s6 -; GFX8-NEXT: s_and_b32 s4, s4, 1 -; GFX8-NEXT: s_lshl_b32 s4, s4, 4 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_andn2_b32 s4, s6, s4 -; GFX8-NEXT: v_or_b32_e32 v6, s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s5 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16 +; 
GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_and_b32 s0, s4, 7 +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:20 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:28 +; GFX8-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX8-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX8-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX8-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v8i16_v_s: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_lshr_b32 s5, s4, 1 -; GFX7-NEXT: s_cmp_eq_u32 s5, 1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_cselect_b32 s6, s1, s0 -; GFX7-NEXT: s_cmp_eq_u32 s5, 2 -; GFX7-NEXT: s_cselect_b32 s6, s2, s6 -; GFX7-NEXT: s_cmp_eq_u32 s5, 3 -; GFX7-NEXT: s_cselect_b32 s6, s3, s6 -; GFX7-NEXT: s_and_b32 s4, s4, 1 -; GFX7-NEXT: s_lshl_b32 s4, s4, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 -; GFX7-NEXT: s_andn2_b32 s4, s6, s4 -; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s5 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_and_b32 s0, s4, 7 +; GFX7-NEXT: v_mov_b32_e32 v1, 16 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:20 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:24 +; GFX7-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:28 +; GFX7-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX7-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; 
GFX10-LABEL: insertelement_s_v8i16_v_s: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s10, -1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_lshr_b32 s5, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX10-NEXT: s_cmp_eq_u32 s5, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s5 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_and_b32 s4, s4, 7 +; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v5, s4, 16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cselect_b32 s6, s1, s0 -; GFX10-NEXT: s_cmp_eq_u32 s5, 2 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_cselect_b32 s6, s2, s6 -; GFX10-NEXT: s_cmp_eq_u32 s5, 3 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_cselect_b32 s6, s3, s6 -; GFX10-NEXT: s_and_b32 s4, s4, 1 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: s_lshl_b32 s4, s4, 4 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: s_lshl_b32 s7, 0xffff, s4 -; GFX10-NEXT: s_andn2_b32 s6, s6, s7 -; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v4, s3 +; GFX10-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:16 +; GFX10-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:20 +; GFX10-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:24 +; GFX10-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:28 +; GFX10-NEXT: buffer_store_short v0, v5, s[8:11], 0 offen +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX10-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX10-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v8i16_v_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 -; GFX11-NEXT: s_lshr_b32 s5, s4, 1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX11-NEXT: s_cmp_eq_u32 s5, 1 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 +; GFX11-NEXT: s_and_b32 s4, s4, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s4, s4, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cselect_b32 s6, s1, s0 -; GFX11-NEXT: s_cmp_eq_u32 s5, 2 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_cselect_b32 s6, s2, s6 -; GFX11-NEXT: s_cmp_eq_u32 s5, 3 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: s_cselect_b32 s6, s3, s6 -; GFX11-NEXT: s_and_b32 s4, s4, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: s_lshl_b32 s4, s4, 4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshl_b32 s7, 0xffff, s4 -; GFX11-NEXT: s_and_not1_b32 s6, s6, s7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v6, v4, 
s4, s6 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_mov_b32 v4, s3 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], off offset:16 +; GFX11-NEXT: scratch_store_b16 v5, v0, off offset:16 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:16 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2203,181 +2368,151 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_s_v8i16_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s5 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:16 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:20 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28 +; GFX9-NEXT: v_add_u32_e32 v0, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX9-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: 
v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v8i16_s_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_mov_b32 s5, 0xffff -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX8-NEXT: v_or_b32_e32 v6, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s5 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:16 +; GFX8-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:20 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:24 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX8-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX8-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX8-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v8i16_s_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, 0xffff -; GFX7-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 -; GFX7-NEXT: 
v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s5 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:20 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:24 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX7-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v8i16_s_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 -; GFX10-NEXT: s_and_b32 s1, s4, 0xffff -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: v_and_b32_e32 v0, 7, v0 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s9 -; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s5 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16 +; GFX10-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:20 +; GFX10-NEXT: buffer_store_dword v4, off, s[8:11], 
0 offset:24 +; GFX10-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:28 +; GFX10-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX10-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX10-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v8i16_s_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v0 -; GFX11-NEXT: s_and_b32 s1, s4, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v5, s4 :: v_dual_and_b32 v4, 7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_lshlrev_b32 v1, 4, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e64 v4, v1, s1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_xor_b32_e32 v5, -1, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:16 +; GFX11-NEXT: scratch_store_b16 v4, v5, off offset:16 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:16 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2390,181 +2525,151 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_s_v8i16_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX9-NEXT: s_mov_b32 s8, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_and_b32_e32 v1, 7, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:20 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_add_u32_e32 v1, 16, v1 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:28 +; GFX9-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX9-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v8i16_v_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: s_mov_b32 s8, 0xffff -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_or_b32_e32 v6, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_cmp_eq_u32_e64 
s[4:5], 0, v4 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_and_b32_e32 v1, 7, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:20 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v1 +; GFX8-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:28 +; GFX8-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX8-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v8i16_v_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: s_mov_b32 s8, s4 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s6 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_mov_b32_e32 v6, s7 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s8 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_and_b32_e32 v1, 7, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:16 +; GFX7-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:20 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX7-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 +; GFX7-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 16, v1 +; GFX7-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:28 +; GFX7-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; 
GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX7-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX7-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v8i16_v_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 7, v1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 -; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 16, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:20 +; GFX10-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:28 +; GFX10-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v8i16_v_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_lshlrev_b32 v2, 4, v2 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo -; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 -; GFX11-NEXT: v_xor_b32_e32 v5, -1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_mov_b32_e32 v3, s7 -; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_and_b32 v5, 7, v1 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 1, v5 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], off offset:16 +; GFX11-NEXT: scratch_store_b16 v5, v0, off offset:16 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:16 ; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2578,150 +2683,137 @@ ; GFX9-LABEL: insertelement_v_v8i16_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_and_b32 s1, s2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: v_and_b32_e32 v0, 7, v2 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_add_u32_e32 v0, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:20 +; GFX9-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:24 +; GFX9-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:28 +; GFX9-NEXT: buffer_store_short v1, v0, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX9-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; 
GFX9-NEXT: v_and_or_b32 v9, v9, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] -; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_and_b32 s1, s2, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s3 +; GFX8-NEXT: v_and_b32_e32 v0, 7, v2 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:20 +; GFX8-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:24 +; GFX8-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:28 +; GFX8-NEXT: buffer_store_short v1, v0, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX8-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v1, v9, v1 -; GFX8-NEXT: v_or_b32_e32 v9, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v8i16_s_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_and_b32 s0, s2, 0xffff -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 
0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s3 +; GFX7-NEXT: v_and_b32_e32 v0, 7, v2 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v7, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:20 +; GFX7-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:24 +; GFX7-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:28 +; GFX7-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX7-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v8i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT: s_and_b32 s1, s2, 0xffff -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 -; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX10-NEXT: v_and_b32_e32 v0, 7, v2 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 -; GFX10-NEXT: v_and_or_b32 v9, v2, v7, v0 -; GFX10-NEXT: v_mov_b32_e32 v7, 0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 -; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX10-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:20 +; GFX10-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:28 +; GFX10-NEXT: buffer_store_short v1, v0, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; 
GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v8i16_s_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX11-NEXT: s_and_b32 s1, s2, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 -; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 7, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b128 off, v[3:6], off offset:16 +; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:16 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:16 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v9, v2, v7, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_cndmask_b32 v1, v4, v9 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 -; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr @@ -2734,146 +2826,137 @@ ; GFX9-LABEL: insertelement_v_v8i16_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: s_and_b32 s0, s2, 1 -; GFX9-NEXT: s_lshr_b32 s4, s2, 1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: s_not_b32 s5, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s2, 7 +; GFX9-NEXT: v_mov_b32_e32 v0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; 
GFX9-NEXT: v_and_or_b32 v9, v1, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] -; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX9-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:20 +; GFX9-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:24 +; GFX9-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:28 +; GFX9-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX9-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s0, s2, 1 -; GFX8-NEXT: s_lshr_b32 s4, s2, 1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_not_b32 s5, s0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s3 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: s_and_b32 s0, s2, 7 +; GFX8-NEXT: v_mov_b32_e32 v0, 16 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:20 +; GFX8-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:24 +; GFX8-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:28 +; GFX8-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX8-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v1, s5, v1 -; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3] -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v8i16_v_s: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: buffer_load_dwordx4 
v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s0, s2, 1 -; GFX7-NEXT: s_lshr_b32 s4, s2, 1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s5, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 -; GFX7-NEXT: s_mov_b32 s10, -1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v1, v0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 -; GFX7-NEXT: s_endpgm -; +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s3 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_and_b32 s0, s2, 7 +; GFX7-NEXT: v_mov_b32_e32 v0, 16 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:20 +; GFX7-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:24 +; GFX7-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:28 +; GFX7-NEXT: buffer_store_short v2, v0, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20 +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24 +; GFX7-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX7-NEXT: s_endpgm +; ; GFX10-LABEL: insertelement_v_v8i16_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s3, s2, 1 -; GFX10-NEXT: s_and_b32 s1, s2, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 -; GFX10-NEXT: s_lshl_b32 s2, s1, 4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s2 -; GFX10-NEXT: v_mov_b32_e32 v7, 0 -; GFX10-NEXT: s_not_b32 s2, s2 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 s0, s2, 7 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 -; GFX10-NEXT: v_and_or_b32 v9, v0, s2, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 -; 
GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 -; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX10-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:20 +; GFX10-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:28 +; GFX10-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v8i16_v_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off -; GFX11-NEXT: s_lshr_b32 s3, s2, 1 -; GFX11-NEXT: s_and_b32 s1, s2, 1 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 -; GFX11-NEXT: s_lshl_b32 s2, s1, 4 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 -; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v1, 0xffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, s2, v1 -; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: s_and_b32 s0, s2, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v9, v0, s2, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 -; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off +; GFX11-NEXT: scratch_store_b128 off, v[3:6], off offset:16 +; GFX11-NEXT: scratch_store_b16 v0, v2, off offset:16 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:16 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr @@ -2886,147 +2969,133 @@ ; GFX9-LABEL: insertelement_v_v8i16_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, 
v1, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s2 +; GFX9-NEXT: v_and_b32_e32 v0, 7, v3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] -; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:20 +; GFX9-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:24 +; GFX9-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:28 +; GFX9-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX9-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i16_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s2 +; GFX8-NEXT: v_and_b32_e32 v0, 7, v3 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v3, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:20 +; GFX8-NEXT: buffer_store_dword v6, off, s[4:7], 0 
offset:24 +; GFX8-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:28 +; GFX8-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX8-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v8i16_v_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s10, 0 -; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_mov_b64 s[8:9], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s2 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v0, 7, v3 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:16 +; GFX7-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:20 +; GFX7-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:24 +; GFX7-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:28 +; GFX7-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX7-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX7-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v3, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v8i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 7, v3 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 -; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 -; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:20 +; GFX10-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:28 +; GFX10-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20 +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24 +; GFX10-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v8i16_v_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 7, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_cndmask_b32 v3, v4, v5 :: v_dual_lshlrev_b32 v0, 4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 -; GFX11-NEXT: v_lshlrev_b32_e64 v8, v0, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v8 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v3, v3, v2, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 -; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: scratch_store_b128 off, v[4:7], off offset:16 +; GFX11-NEXT: scratch_store_b16 v0, v2, off offset:16 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:16 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], 
off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr @@ -3039,176 +3108,226 @@ ; GFX9-LABEL: insertelement_s_v16i16_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX9-NEXT: s_lshr_b32 s7, s5, 1 -; GFX9-NEXT: s_cmp_eq_u32 s7, 1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 32 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cselect_b32 s0, s9, s8 -; GFX9-NEXT: s_cmp_eq_u32 s7, 2 -; GFX9-NEXT: s_cselect_b32 s0, s10, s0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 3 -; GFX9-NEXT: s_cselect_b32 s0, s11, s0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 4 -; GFX9-NEXT: s_cselect_b32 s0, s12, s0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 5 -; GFX9-NEXT: s_cselect_b32 s0, s13, s0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 6 -; GFX9-NEXT: s_cselect_b32 s0, s14, s0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 7 -; GFX9-NEXT: s_cselect_b32 s0, s15, s0 -; GFX9-NEXT: s_and_b32 s1, s5, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s2, s4, 0xffff -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 -; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_andn2_b32 s0, s0, s1 -; GFX9-NEXT: s_or_b32 s16, s0, s2 -; GFX9-NEXT: s_cmp_eq_u32 s7, 0 -; GFX9-NEXT: s_cselect_b32 s0, s16, s8 -; GFX9-NEXT: s_cmp_eq_u32 s7, 1 -; GFX9-NEXT: s_cselect_b32 s1, s16, s9 -; GFX9-NEXT: s_cmp_eq_u32 s7, 2 -; GFX9-NEXT: s_cselect_b32 s2, s16, s10 -; GFX9-NEXT: s_cmp_eq_u32 s7, 3 -; GFX9-NEXT: s_cselect_b32 s3, s16, s11 -; GFX9-NEXT: s_cmp_eq_u32 s7, 4 -; GFX9-NEXT: s_cselect_b32 s4, s16, s12 -; GFX9-NEXT: s_cmp_eq_u32 s7, 5 -; GFX9-NEXT: s_cselect_b32 s5, s16, s13 -; GFX9-NEXT: s_cmp_eq_u32 s7, 6 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_cselect_b32 s6, s16, s14 -; GFX9-NEXT: s_cmp_eq_u32 s7, 7 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_cselect_b32 s7, s16, s15 -; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off -; GFX9-NEXT: v_mov_b32_e32 v4, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_mov_b32 s18, -1 +; GFX9-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-NEXT: s_add_u32 s16, s16, s6 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:32 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:36 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:40 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_and_b32 s0, s5, 15 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:60 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_short v1, v0, s[16:19], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX9-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:40 +; GFX9-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX9-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX9-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX9-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX9-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 16 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX8-NEXT: s_and_b32 s1, s5, 1 -; GFX8-NEXT: s_lshr_b32 m0, s5, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s2, s4, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_movrels_b32 s0, s8 -; GFX8-NEXT: s_lshl_b32 s2, s2, s1 -; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_movreld_b32 s8, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: s_mov_b32 s18, -1 +; GFX8-NEXT: s_mov_b32 s19, 0xe80000 +; GFX8-NEXT: s_add_u32 s16, s16, s6 +; GFX8-NEXT: s_addc_u32 s17, s17, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:32 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:36 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:40 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:44 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:48 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:52 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: s_and_b32 s0, s5, 15 +; GFX8-NEXT: v_mov_b32_e32 v0, 32 +; GFX8-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:56 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:60 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_store_short v1, v0, s[16:19], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX8-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX8-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:40 +; GFX8-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX8-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX8-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX8-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX8-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 
v10, 16 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX7-NEXT: s_and_b32 s1, s5, 1 -; GFX7-NEXT: s_lshr_b32 m0, s5, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s2, s4, 0xffff +; GFX7-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_movrels_b32 s0, s8 -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 -; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_movreld_b32 s8, s0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s18, -1 +; GFX7-NEXT: s_mov_b32 s19, 0xe8f000 +; GFX7-NEXT: s_add_u32 s16, s16, s6 +; GFX7-NEXT: s_addc_u32 s17, s17, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:32 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:36 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:40 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:44 +; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:48 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:52 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: s_and_b32 s0, s5, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, 32 +; GFX7-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:56 +; GFX7-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:60 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_short v1, v0, s[16:19], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX7-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX7-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:40 +; GFX7-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX7-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX7-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX7-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX7-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt vmcnt(4) ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[0:1], 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v16i16_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX10-NEXT: s_and_b32 s0, s5, 1 -; GFX10-NEXT: s_lshr_b32 m0, s5, 1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_and_b32 s1, s4, 0xffff -; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s0 -; GFX10-NEXT: s_lshl_b32 s0, s1, s0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 
v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s18, -1 +; GFX10-NEXT: s_mov_b32 s19, 0x31c16000 +; GFX10-NEXT: s_add_u32 s16, s16, s6 +; GFX10-NEXT: s_addc_u32 s17, s17, 0 +; GFX10-NEXT: s_and_b32 s0, s5, 15 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_movrels_b32 s3, s8 -; GFX10-NEXT: s_andn2_b32 s1, s3, s2 -; GFX10-NEXT: s_or_b32 s0, s1, s0 -; GFX10-NEXT: s_movreld_b32 s8, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 -; GFX10-NEXT: v_mov_b32_e32 v6, s14 -; GFX10-NEXT: v_mov_b32_e32 v7, s15 +; GFX10-NEXT: v_add_nc_u32_e64 v1, s0, 32 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, s9 +; GFX10-NEXT: v_mov_b32_e32 v4, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, s11 +; GFX10-NEXT: v_mov_b32_e32 v6, s12 +; GFX10-NEXT: v_mov_b32_e32 v7, s13 +; GFX10-NEXT: v_mov_b32_e32 v8, s14 +; GFX10-NEXT: v_mov_b32_e32 v9, s15 +; GFX10-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:32 +; GFX10-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:36 +; GFX10-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:40 +; GFX10-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:44 +; GFX10-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:48 +; GFX10-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:52 +; GFX10-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:56 +; GFX10-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:60 +; GFX10-NEXT: buffer_store_short v0, v1, s[16:19], 0 offen +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX10-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX10-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:40 +; GFX10-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX10-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX10-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX10-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX10-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v16i16_s_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 -; GFX11-NEXT: s_and_b32 s0, s5, 1 -; GFX11-NEXT: s_lshr_b32 m0, s5, 1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_and_b32 s1, s4, 0xffff -; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s0 -; GFX11-NEXT: s_lshl_b32 s0, s1, s0 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_movrels_b32 s3, s8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_b32 s1, s3, s2 -; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s0, s5, 15 +; GFX11-NEXT: v_mov_b32_e32 v8, s4 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_movreld_b32 s8, s0 -; GFX11-NEXT: 
v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_dual_mov_b32 v10, 16 :: v_dual_mov_b32 v9, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v0, s8 +; GFX11-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v6, s14 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], off offset:48 +; GFX11-NEXT: scratch_store_b16 v9, v8, off offset:32 ; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:32 +; GFX11-NEXT: scratch_load_b128 v[4:7], off, off offset:48 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v9, 0 +; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -3223,97 +3342,138 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: s_and_b32 s0, s3, 1 -; GFX9-NEXT: s_lshr_b32 s12, s3, 1 -; GFX9-NEXT: s_and_b32 s1, s2, 0xffff -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_lshl_b32 s1, s1, s0 -; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX9-NEXT: s_not_b32 s13, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s3, 15 +; GFX9-NEXT: v_mov_b32_e32 v0, 32 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, 16 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 
offset:32 +; GFX9-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:36 +; GFX9-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:40 +; GFX9-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:44 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:52 +; GFX9-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:56 +; GFX9-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:60 +; GFX9-NEXT: buffer_store_short v1, v0, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36 +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40 +; GFX9-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44 +; GFX9-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52 +; GFX9-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56 +; GFX9-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, 16 -; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_s_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-NEXT: s_and_b32 s0, s3, 1 -; GFX8-NEXT: s_and_b32 s1, s2, 0xffff -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshr_b32 m0, s3, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: s_and_b32 s0, s3, 15 +; GFX8-NEXT: v_mov_b32_e32 v0, 32 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_movrels_b32_e32 v12, v0 -; GFX8-NEXT: v_and_b32_e32 v12, s0, v12 -; GFX8-NEXT: v_or_b32_e32 v12, s1, v12 -; GFX8-NEXT: v_movreld_b32_e32 v0, v12 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:32 +; GFX8-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:36 +; GFX8-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:40 +; GFX8-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:44 +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:48 +; GFX8-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:52 +; GFX8-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:56 +; GFX8-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:60 +; GFX8-NEXT: buffer_store_short v1, v0, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32 +; GFX8-NEXT: buffer_load_dword v1, off, 
s[4:7], 0 offset:36 +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40 +; GFX8-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44 +; GFX8-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48 +; GFX8-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52 +; GFX8-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56 +; GFX8-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60 +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: s_waitcnt vmcnt(4) ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v16i16_s_s: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s4 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: s_and_b32 s0, s3, 1 -; GFX7-NEXT: s_and_b32 s1, s2, 0xffff -; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: s_lshr_b32 m0, s3, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, s0 -; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_and_b32 s0, s3, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, 32 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_movrels_b32_e32 v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 -; GFX7-NEXT: v_movreld_b32_e32 v2, v0 -; GFX7-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:32 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:36 +; GFX7-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:40 +; GFX7-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:44 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:48 +; GFX7-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:52 +; GFX7-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:56 +; GFX7-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:60 +; GFX7-NEXT: buffer_store_short v1, v0, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:32 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:36 +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 +; GFX7-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 +; GFX7-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:48 +; GFX7-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:52 +; GFX7-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:56 +; GFX7-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:60 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], 16 -; GFX7-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v16i16_s_s: @@ -3321,23 +3481,45 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX10-NEXT: s_and_b32 s0, s3, 1 -; GFX10-NEXT: 
s_lshr_b32 m0, s3, 1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_and_b32 s1, s2, 0xffff -; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s0 -; GFX10-NEXT: s_lshl_b32 s0, s1, s0 -; GFX10-NEXT: s_not_b32 s1, s2 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 s0, s3, 15 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 +; GFX10-NEXT: v_add_nc_u32_e64 v1, s0, 32 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:32 +; GFX10-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:36 +; GFX10-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:40 +; GFX10-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:44 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_movrels_b32_e32 v0, v2 -; GFX10-NEXT: v_and_or_b32 v12, v0, s1, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_movreld_b32_e32 v2, v12 -; GFX10-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX10-NEXT: global_store_dwordx4 v[10:11], v[6:9], off +; GFX10-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:48 +; GFX10-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:52 +; GFX10-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:56 +; GFX10-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:60 +; GFX10-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36 +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40 +; GFX10-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44 +; GFX10-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48 +; GFX10-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52 +; GFX10-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56 +; GFX10-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v16i16_s_s: @@ -3345,25 +3527,25 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off ; GFX11-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16 -; GFX11-NEXT: s_and_b32 s0, s3, 1 -; GFX11-NEXT: s_lshr_b32 m0, s3, 1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_and_b32 s1, s2, 0xffff -; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s0 -; GFX11-NEXT: s_lshl_b32 s0, s1, s0 -; GFX11-NEXT: s_not_b32 s1, s2 +; GFX11-NEXT: s_and_b32 s0, s3, 15 ; GFX11-NEXT: v_mov_b32_e32 v10, 16 -; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b128 off, v[2:5], off offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_movrels_b32_e32 v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v12, v0, s1, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_movreld_b32_e32 v2, 
v12
+; GFX11-NEXT: scratch_store_b128 off, v[6:9], off offset:48
+; GFX11-NEXT: scratch_store_b16 v1, v0, off offset:32
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
-; GFX11-NEXT: global_store_b128 v[10:11], v[6:9], off
+; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:32
+; GFX11-NEXT: scratch_load_b128 v[4:7], off, off offset:48
+; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v11, 0
+; GFX11-NEXT: v_mov_b32_e32 v9, 0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
@@ -3376,174 +3558,221 @@
; GFX9-LABEL: insertelement_s_v16i16_v_s:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
-; GFX9-NEXT: s_lshr_b32 s2, s4, 1
-; GFX9-NEXT: s_cmp_eq_u32 s2, 1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 32
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cselect_b32 s0, s9, s8
-; GFX9-NEXT: s_cmp_eq_u32 s2, 2
-; GFX9-NEXT: s_cselect_b32 s0, s10, s0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 3
-; GFX9-NEXT: s_cselect_b32 s0, s11, s0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 4
-; GFX9-NEXT: s_cselect_b32 s0, s12, s0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 5
-; GFX9-NEXT: s_cselect_b32 s0, s13, s0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 6
-; GFX9-NEXT: s_cselect_b32 s0, s14, s0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 7
-; GFX9-NEXT: s_cselect_b32 s0, s15, s0
-; GFX9-NEXT: s_and_b32 s1, s4, 1
-; GFX9-NEXT: s_lshl_b32 s1, s1, 4
-; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s1
-; GFX9-NEXT: s_andn2_b32 s0, s0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_lshl_or_b32 v8, v0, s1, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
+; GFX9-NEXT: s_mov_b32 s18, -1
+; GFX9-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-NEXT: s_add_u32 s16, s16, s5
+; GFX9-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:32
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:36
; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3
-; GFX9-NEXT: v_mov_b32_e32 v5, s13
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5
-; GFX9-NEXT: v_mov_b32_e32 v6, s14
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6
-; GFX9-NEXT: v_mov_b32_e32 v4, s12
-; GFX9-NEXT: v_mov_b32_e32 v7, s15
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
+; GFX9-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:40
+; GFX9-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:44
+; GFX9-NEXT: v_mov_b32_e32 v2, s12
+; GFX9-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:48
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: s_and_b32 s0, s4, 15
+; GFX9-NEXT: 
buffer_store_dword v2, off, s[16:19], 0 offset:52 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:56 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX9-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:60 +; GFX9-NEXT: buffer_store_short v0, v1, s[16:19], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX9-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:40 +; GFX9-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX9-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX9-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX9-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX9-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX8-NEXT: s_and_b32 s1, s4, 1 -; GFX8-NEXT: s_lshr_b32 m0, s4, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_movrels_b32 s0, s8 -; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: v_or_b32_e32 v8, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_mov_b32 s18, -1 +; GFX8-NEXT: s_mov_b32 s19, 0xe80000 +; GFX8-NEXT: s_add_u32 s16, s16, s5 +; GFX8-NEXT: s_addc_u32 s17, s17, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:32 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:36 ; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NEXT: v_mov_b32_e32 v5, s13 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 -; GFX8-NEXT: v_mov_b32_e32 v7, s15 -; GFX8-NEXT: v_movreld_b32_e32 v0, v8 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:40 +; GFX8-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:44 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:48 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: s_and_b32 s0, s4, 15 +; GFX8-NEXT: v_mov_b32_e32 v1, 32 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:52 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:56 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:60 +; GFX8-NEXT: buffer_store_short v0, v1, s[16:19], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX8-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX8-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:40 +; GFX8-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX8-NEXT: 
buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX8-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX8-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX8-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: s_waitcnt vmcnt(4) ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX7-NEXT: s_and_b32 s1, s4, 1 -; GFX7-NEXT: s_lshr_b32 m0, s4, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_movrels_b32 s0, s8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 -; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: v_or_b32_e32 v8, s0, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: s_mov_b32 s18, -1 +; GFX7-NEXT: s_mov_b32 s19, 0xe8f000 +; GFX7-NEXT: s_add_u32 s16, s16, s5 +; GFX7-NEXT: s_addc_u32 s17, s17, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:32 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:36 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-NEXT: v_mov_b32_e32 v6, s14 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:40 +; GFX7-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:44 +; GFX7-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:48 +; GFX7-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-NEXT: s_and_b32 s0, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, 32 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:52 +; GFX7-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:56 +; GFX7-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:60 +; GFX7-NEXT: buffer_store_short v0, v1, s[16:19], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX7-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX7-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:40 +; GFX7-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX7-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX7-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX7-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX7-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: v_movreld_b32_e32 v0, v8 +; GFX7-NEXT: s_waitcnt vmcnt(4) ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[0:1], 16 +; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v16i16_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX10-NEXT: s_and_b32 s0, s4, 1 -; GFX10-NEXT: s_lshr_b32 m0, s4, 1 -; 
GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GFX10-NEXT: s_lshl_b32 s1, 0xffff, s0 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 -; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_movrels_b32 s2, s8 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_lshl_or_b32 v12, v8, s0, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 -; GFX10-NEXT: v_mov_b32_e32 v6, s14 -; GFX10-NEXT: v_mov_b32_e32 v7, s15 +; GFX10-NEXT: s_mov_b32 s18, -1 +; GFX10-NEXT: s_mov_b32 s19, 0x31c16000 +; GFX10-NEXT: s_add_u32 s16, s16, s5 +; GFX10-NEXT: s_addc_u32 s17, s17, 0 +; GFX10-NEXT: s_and_b32 s0, s4, 15 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v1, s0, 32 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, s9 +; GFX10-NEXT: v_mov_b32_e32 v4, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, s11 +; GFX10-NEXT: v_mov_b32_e32 v6, s12 +; GFX10-NEXT: v_mov_b32_e32 v7, s13 +; GFX10-NEXT: v_mov_b32_e32 v8, s14 +; GFX10-NEXT: v_mov_b32_e32 v9, s15 +; GFX10-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:32 +; GFX10-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:36 +; GFX10-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:40 +; GFX10-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:44 +; GFX10-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:48 +; GFX10-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:52 +; GFX10-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:56 +; GFX10-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:60 +; GFX10-NEXT: buffer_store_short v0, v1, s[16:19], 0 offen +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX10-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX10-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:40 +; GFX10-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX10-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX10-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX10-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX10-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 -; GFX10-NEXT: v_movreld_b32_e32 v0, v12 +; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v16i16_v_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 -; GFX11-NEXT: s_and_b32 s0, s4, 1 -; GFX11-NEXT: s_lshr_b32 m0, s4, 1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v0 -; GFX11-NEXT: s_lshl_b32 s1, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s0, s4, 15 ; GFX11-NEXT: v_mov_b32_e32 v10, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: v_mov_b32_e32 v9, s0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_movrels_b32 s2, s8 -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: s_and_not1_b32 s1, s2, s1 -; GFX11-NEXT: v_mov_b32_e32 v1, s9 -; GFX11-NEXT: v_lshl_or_b32 v12, v8, s0, s1 -; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v5, s13 -; GFX11-NEXT: v_dual_mov_b32 v4, 
s12 :: v_dual_mov_b32 v7, s15 -; GFX11-NEXT: v_mov_b32_e32 v6, s14 +; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9 +; GFX11-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11 +; GFX11-NEXT: v_dual_mov_b32 v5, s12 :: v_dual_mov_b32 v6, s13 +; GFX11-NEXT: v_dual_mov_b32 v7, s14 :: v_dual_mov_b32 v8, s15 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 off, v[5:8], off offset:48 +; GFX11-NEXT: scratch_store_b16 v9, v0, off offset:32 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:32 +; GFX11-NEXT: scratch_load_b128 v[4:7], off, off offset:48 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_movreld_b32_e32 v0, v12 -; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -3556,288 +3785,224 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_s_v16i16_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mov_b32_e32 v3, s18 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s19 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s20 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 -; GFX9-NEXT: v_mov_b32_e32 v6, s21 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, s22 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v9, s23 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v6, s22 -; GFX9-NEXT: v_mov_b32_e32 v7, s23 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] 
+; GFX9-NEXT: s_mov_b32 s18, -1 +; GFX9-NEXT: s_mov_b32 s19, 0xe00000 +; GFX9-NEXT: s_add_u32 s16, s16, s5 +; GFX9-NEXT: s_addc_u32 s17, s17, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:32 +; GFX9-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:36 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:40 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:44 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:48 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:52 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:56 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:60 +; GFX9-NEXT: v_add_u32_e32 v0, 32, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_short v1, v0, s[16:19], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX9-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:40 +; GFX9-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX9-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX9-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX9-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX9-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_s_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 1, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s16 -; GFX8-NEXT: v_mov_b32_e32 v2, s17 -; GFX8-NEXT: v_mov_b32_e32 v3, s18 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, s19 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s20 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 -; GFX8-NEXT: v_mov_b32_e32 v6, s21 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_mov_b32_e32 v7, s22 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_mov_b32 s5, 0xffff -; GFX8-NEXT: s_and_b32 s4, s4, 0xffff -; GFX8-NEXT: v_mov_b32_e32 v9, s23 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 -; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 -; 
GFX8-NEXT: v_or_b32_e32 v9, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 -; GFX8-NEXT: v_mov_b32_e32 v5, s21 -; GFX8-NEXT: v_mov_b32_e32 v6, s22 -; GFX8-NEXT: v_mov_b32_e32 v7, s23 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX8-NEXT: s_mov_b32 s18, -1 +; GFX8-NEXT: s_mov_b32 s19, 0xe80000 +; GFX8-NEXT: s_add_u32 s16, s16, s5 +; GFX8-NEXT: s_addc_u32 s17, s17, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:32 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:36 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:40 +; GFX8-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:44 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:48 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 32 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:52 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:56 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:60 +; GFX8-NEXT: buffer_store_short v1, v0, s[16:19], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX8-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX8-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:40 +; GFX8-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX8-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX8-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX8-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX8-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: s_waitcnt vmcnt(4) ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_s_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 1, v0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s16 -; GFX7-NEXT: v_mov_b32_e32 v2, s17 -; GFX7-NEXT: v_mov_b32_e32 v3, s18 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_mov_b32_e32 v4, s19 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v5, s20 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] -; 
GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 -; GFX7-NEXT: v_mov_b32_e32 v6, s21 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: v_mov_b32_e32 v7, s22 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, 0xffff -; GFX7-NEXT: v_mov_b32_e32 v9, s23 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 -; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] -; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v9, v0, v2 -; GFX7-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-NEXT: v_mov_b32_e32 v6, s22 -; GFX7-NEXT: v_mov_b32_e32 v7, s23 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] +; GFX7-NEXT: s_mov_b32 s18, -1 +; GFX7-NEXT: s_mov_b32 s19, 0xe8f000 +; GFX7-NEXT: s_add_u32 s16, s16, s5 +; GFX7-NEXT: s_addc_u32 s17, s17, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:32 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:36 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:40 +; GFX7-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:44 +; GFX7-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:48 +; GFX7-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 32 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:52 +; GFX7-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:56 +; GFX7-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:60 +; GFX7-NEXT: buffer_store_short v1, v0, s[16:19], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX7-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX7-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:40 +; GFX7-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX7-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX7-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX7-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX7-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX7-NEXT: s_waitcnt vmcnt(4) ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[0:1], 16 +; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: buffer_store_dwordx4 
v[4:7], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v16i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_and_b32 s5, s4, 0xffff -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 +; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 -; GFX10-NEXT: v_mov_b32_e32 v6, s14 -; GFX10-NEXT: v_mov_b32_e32 v7, s15 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: s_mov_b32 s18, -1 +; GFX10-NEXT: s_mov_b32 s19, 0x31c16000 +; GFX10-NEXT: s_add_u32 s16, s16, s5 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_addc_u32 s17, s17, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 32, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, s9 +; GFX10-NEXT: v_mov_b32_e32 v4, s10 +; GFX10-NEXT: v_mov_b32_e32 v5, s11 +; GFX10-NEXT: v_mov_b32_e32 v6, s12 +; GFX10-NEXT: v_mov_b32_e32 v7, s13 +; GFX10-NEXT: v_mov_b32_e32 v8, s14 +; GFX10-NEXT: v_mov_b32_e32 v9, s15 +; GFX10-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:32 +; GFX10-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:36 +; GFX10-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:40 +; GFX10-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:44 +; GFX10-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:48 +; GFX10-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:52 +; GFX10-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:56 +; GFX10-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:60 +; GFX10-NEXT: buffer_store_short v1, v0, s[16:19], 0 offen +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:32 +; GFX10-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:36 +; GFX10-NEXT: buffer_load_dword v2, off, s[16:19], 0 
offset:40 +; GFX10-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:44 +; GFX10-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:48 +; GFX10-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:52 +; GFX10-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:56 +; GFX10-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:60 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v16i16_s_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 1, v0 -; GFX11-NEXT: s_and_b32 s5, s4, 0xffff -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_dual_mov_b32 v9, s4 :: v_dual_and_b32 v8, 15, v0 +; GFX11-NEXT: v_mov_b32_e32 v10, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e64 v8, v0, s5 +; GFX11-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_lshlrev_b32 v8, 1, v8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s9 -; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 -; GFX11-NEXT: v_xor_b32_e32 v9, -1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5 ; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8 ; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 ; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 ; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:32 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], off offset:48 +; GFX11-NEXT: scratch_store_b16 v8, v9, off offset:32 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:32 +; GFX11-NEXT: scratch_load_b128 v[4:7], off, off 
offset:48 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v9, 0 +; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -3850,288 +4015,223 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_s_v16i16_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v6, s16 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 -; GFX9-NEXT: v_mov_b32_e32 v7, s17 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_mov_b32_e32 v9, s18 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX9-NEXT: s_mov_b32 s20, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v10, s19 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s20 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v6, s18 -; GFX9-NEXT: v_mov_b32_e32 v7, s19 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s4 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:32 +; GFX9-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:36 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:40 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:44 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: 
buffer_store_dword v2, off, s[8:11], 0 offset:48 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:52 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:56 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:60 +; GFX9-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:36 +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 +; GFX9-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 +; GFX9-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:48 +; GFX9-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:52 +; GFX9-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:56 +; GFX9-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:60 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_v_v: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mov_b32_e32 v4, s14 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v5, s15 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v6, s16 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 -; GFX8-NEXT: v_mov_b32_e32 v7, s17 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, s18 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: s_mov_b32 s20, 0xffff -; GFX8-NEXT: v_mov_b32_e32 v10, s19 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s20 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mov_b32_e32 v5, s17 -; GFX8-NEXT: v_mov_b32_e32 v6, s18 -; GFX8-NEXT: v_mov_b32_e32 v7, s19 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX8-NEXT: 
v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s4 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 32 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:32 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:36 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:40 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:44 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:48 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:52 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:56 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:60 +; GFX8-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:32 +; GFX8-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:36 +; GFX8-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 +; GFX8-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 +; GFX8-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:48 +; GFX8-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:52 +; GFX8-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:56 +; GFX8-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:60 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: s_waitcnt vmcnt(4) ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_v_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-NEXT: v_mov_b32_e32 v3, s13 -; GFX7-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: v_mov_b32_e32 v5, s15 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 -; GFX7-NEXT: v_mov_b32_e32 v7, s17 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, s18 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_mov_b32_e32 v10, s19 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX7-NEXT: v_and_b32_e32 
v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v9, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-NEXT: v_mov_b32_e32 v3, s15 -; GFX7-NEXT: v_mov_b32_e32 v4, s16 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 -; GFX7-NEXT: v_mov_b32_e32 v7, s19 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s4 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, 32 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:32 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:36 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:40 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:44 +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:48 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:52 +; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:56 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:60 +; GFX7-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:32 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:36 +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 +; GFX7-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 +; GFX7-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:48 +; GFX7-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:52 +; GFX7-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:56 +; GFX7-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:60 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX7-NEXT: s_waitcnt vmcnt(4) ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX7-NEXT: s_mov_b64 s[0:1], 16 +; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v16i16_v_v: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 -; GFX10-NEXT: s_mov_b32 null, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, 
v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s9 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 -; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 -; GFX10-NEXT: v_mov_b32_e32 v6, s14 -; GFX10-NEXT: v_mov_b32_e32 v7, s15 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s4 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 32, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v7, s5 +; GFX10-NEXT: v_mov_b32_e32 v8, s6 +; GFX10-NEXT: v_mov_b32_e32 v9, s7 +; GFX10-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:32 +; GFX10-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:36 +; GFX10-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:40 +; GFX10-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:44 +; GFX10-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:48 +; GFX10-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:52 +; GFX10-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:56 +; GFX10-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:60 +; GFX10-NEXT: buffer_store_short v0, v1, s[8:11], 0 offen +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:32 +; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:36 +; GFX10-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 +; GFX10-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 +; GFX10-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:48 +; GFX10-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:52 +; GFX10-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:56 +; GFX10-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:60 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v16i16_v_v: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 1, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v12
-; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v12
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v12
-; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v12
-; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v12
-; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12
-; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v12
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff
+; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v10, 16 :: v_dual_and_b32 v9, 15, v1
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s9
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, v1, v0
-; GFX11-NEXT: v_xor_b32_e32 v9, -1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5
-; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13
-; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10
-; GFX11-NEXT: v_mov_b32_e32 v7, s15
-; GFX11-NEXT: v_mov_b32_e32 v3, s11
-; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8
-; GFX11-NEXT: v_mov_b32_e32 v4, s12
-; GFX11-NEXT: v_mov_b32_e32 v6, s14
+; GFX11-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v4, s3
+; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
+; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v8, s7
+; GFX11-NEXT: v_dual_mov_b32 v7, s6 :: v_dual_mov_b32 v6, s5
+; GFX11-NEXT: v_mov_b32_e32 v5, s4
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 1, v9
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b128 off, v[1:4], off offset:32
+; GFX11-NEXT: scratch_store_b128 off, v[5:8], off offset:48
+; GFX11-NEXT: scratch_store_b16 v9, v0, off offset:32
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:32
+; GFX11-NEXT: scratch_load_b128 v[4:7], off, off offset:48
 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
 ; GFX11-NEXT: v_mov_b32_e32 v9, 0
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v13 :: v_dual_mov_b32 v10, 16
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1
-; GFX11-NEXT: v_mov_b32_e32 v11, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5
-; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_waitcnt vmcnt(1)
 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off
 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT: s_endpgm
@@ -4146,45 +4246,43 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xe00000
+; GFX9-NEXT: s_add_u32 s4, s4, s3
+; GFX9-NEXT: v_and_b32_e32 v0, 15, v2
+; GFX9-NEXT: s_addc_u32 s5, s5, 0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_add_u32_e32 v0, 32, v0
 ; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11]
-; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3]
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9]
+; GFX9-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:32
+; GFX9-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:36
+; GFX9-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:40
+; GFX9-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:44
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:48
+; GFX9-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:52
+; GFX9-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:56
+; GFX9-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:60
+; GFX9-NEXT: buffer_store_short v1, v0, s[4:7], 0 offen
+; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36
+; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40
+; GFX9-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44
+; GFX9-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48
+; GFX9-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52
+; GFX9-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56
+; GFX9-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60
 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11]
 ; GFX9-NEXT: v_mov_b32_e32 v9, 0
 ; GFX9-NEXT: v_mov_b32_e32 v10, 16
 ; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: s_waitcnt vmcnt(4)
 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
 ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX9-NEXT: s_endpgm
 ;
@@ -4194,94 +4292,89 @@
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
-; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: s_mov_b32 s7, 0xe80000
+; GFX8-NEXT: s_add_u32 s4, s4, s3
+; GFX8-NEXT: v_and_b32_e32 v1, 15, v2
+; GFX8-NEXT: s_addc_u32 s5, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, 32
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9]
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11]
-; GFX8-NEXT: v_and_b32_e32 v1, v11, v1
-; GFX8-NEXT: v_or_b32_e32 v11, v1, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9]
+; GFX8-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:32
+; GFX8-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:36
+; GFX8-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:40
+; GFX8-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:44
+; GFX8-NEXT: s_waitcnt vmcnt(4)
+; GFX8-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:48
+; GFX8-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:52
+; GFX8-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:56
+; GFX8-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:60
+; GFX8-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen
+; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32
+; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36
+; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40
+; GFX8-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44
+; GFX8-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48
+; GFX8-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52
+; GFX8-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56
+; GFX8-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60
 ; GFX8-NEXT: v_mov_b32_e32 v8, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11]
 ; GFX8-NEXT: v_mov_b32_e32 v9, 0
 ; GFX8-NEXT: v_mov_b32_e32 v10, 16
 ; GFX8-NEXT: v_mov_b32_e32 v11, 0
+; GFX8-NEXT: s_waitcnt vmcnt(4)
 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT: s_waitcnt vmcnt(1)
 ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX7-LABEL: insertelement_v_v16i16_s_v:
 ; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s18, 0
-; GFX7-NEXT: s_mov_b32 s19, 0xf000
-; GFX7-NEXT: s_mov_b64 s[16:17], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX7-NEXT: s_and_b32 s0, s2, 0xffff
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
-; GFX7-NEXT: s_mov_b32 s18, -1
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s10, -1
+; GFX7-NEXT: s_mov_b32 s11, 0xe8f000
+; GFX7-NEXT: s_add_u32 s8, s8, s3
+; GFX7-NEXT: v_and_b32_e32 v1, 15, v2
+; GFX7-NEXT: s_addc_u32 s9, s9, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, 32
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT: s_mov_b32 s6, -1
 ; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1]
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3]
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7]
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9]
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11]
-; GFX7-NEXT: v_and_b32_e32 v1, v11, v1
-; GFX7-NEXT: v_or_b32_e32 v11, v1, v2
-; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1]
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3]
-; GFX7-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7]
-; GFX7-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9]
-; GFX7-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
-; GFX7-NEXT: s_mov_b64 s[16:17], 16
-; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
+; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:32
+; GFX7-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:36
+; GFX7-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:40
+; GFX7-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:44
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:48
+; GFX7-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:52
+; GFX7-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:56
+; GFX7-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:60
+; GFX7-NEXT: buffer_store_short v2, v0, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:32
+; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:36
+; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40
+; GFX7-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44
+; GFX7-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:48 +; GFX7-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:52 +; GFX7-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:56 +; GFX7-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:60 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX7-NEXT: s_mov_b64 s[4:5], 16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v16i16_s_v: @@ -4289,45 +4382,44 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: s_and_b32 s5, s2, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v13, 16 -; GFX10-NEXT: v_mov_b32_e32 v14, 0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 4, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX10-NEXT: v_and_b32_e32 v0, 15, v2 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 32, v0 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 +; GFX10-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:32 +; GFX10-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:36 +; GFX10-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:40 +; GFX10-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:44 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 -; GFX10-NEXT: v_and_or_b32 v15, v1, v11, v2 +; GFX10-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:48 +; GFX10-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:52 +; GFX10-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:56 +; GFX10-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:60 +; GFX10-NEXT: buffer_store_short v1, v0, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36 +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40 +; GFX10-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44 +; GFX10-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48 +; GFX10-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52 +; GFX10-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56 +; GFX10-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v10, 16 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 -; GFX10-NEXT: 
v_cndmask_b32_e64 v0, v3, v15, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v15, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v15, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v15, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v15, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v15, s5 -; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v[13:14], v[4:7], off +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v16i16_s_v: @@ -4335,50 +4427,24 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off ; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v2 -; GFX11-NEXT: s_and_b32 s5, s2, 0xffff -; GFX11-NEXT: v_dual_mov_b32 v13, 16 :: v_dual_and_b32 v2, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 4, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, s5 -; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 -; GFX11-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 15, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 +; GFX11-NEXT: scratch_store_b128 off, v[3:6], off offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v15, v1, v11, v2 -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: v_mov_b32_e32 v12, 0 -; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_cndmask_b32 v1, v4, v15 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v15, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v15, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v15, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v7, v15, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v8, v15, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v15, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, v15, s5 +; GFX11-NEXT: scratch_store_b128 off, v[7:10], off offset:48 +; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:32 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[11:12], v[0:3], off -; GFX11-NEXT: global_store_b128 v[13:14], v[4:7], off +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:32 +; GFX11-NEXT: 
scratch_load_b128 v[4:7], off, off offset:48 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr @@ -4392,70 +4458,89 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GFX9-NEXT: s_and_b32 s0, s2, 1 -; GFX9-NEXT: s_lshr_b32 s12, s2, 1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX9-NEXT: s_not_b32 s13, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: v_mov_b32_e32 v0, 32 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] -; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:36 +; GFX9-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:40 +; GFX9-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:44 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:52 +; GFX9-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:56 +; GFX9-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:60 +; GFX9-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36 +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40 +; GFX9-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44 +; GFX9-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52 +; GFX9-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56 +; GFX9-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 
v1, v4, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i16_v_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] -; GFX8-NEXT: s_and_b32 s0, s2, 1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshr_b32 m0, s2, 1 -; GFX8-NEXT: v_mov_b32_e32 v13, s0 -; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, 16 -; GFX8-NEXT: v_mov_b32_e32 v12, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_movrels_b32_e32 v13, v3 -; GFX8-NEXT: v_and_b32_e32 v13, s0, v13 -; GFX8-NEXT: v_or_b32_e32 v2, v13, v2 -; GFX8-NEXT: v_movreld_b32_e32 v3, v2 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1] +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s3 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: s_and_b32 s0, s2, 15 +; GFX8-NEXT: v_mov_b32_e32 v0, 32 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:32 +; GFX8-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:36 +; GFX8-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:40 +; GFX8-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:44 +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:48 +; GFX8-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:52 +; GFX8-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:56 +; GFX8-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:60 +; GFX8-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36 +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40 +; GFX8-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44 +; GFX8-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48 +; GFX8-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52 +; GFX8-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56 +; GFX8-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60 +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v16i16_v_s: @@ -4465,22 +4550,41 @@ ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: 
buffer_load_dwordx4 v[7:10], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: s_and_b32 s0, s2, 1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: s_lshr_b32 m0, s2, 1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s3 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_and_b32 s0, s2, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, 32 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_movrels_b32_e32 v1, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_movreld_b32_e32 v3, v0 -; GFX7-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:32 +; GFX7-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:36 +; GFX7-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:40 +; GFX7-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:44 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:48 +; GFX7-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:52 +; GFX7-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:56 +; GFX7-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:60 +; GFX7-NEXT: buffer_store_short v2, v0, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:32 +; GFX7-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:36 +; GFX7-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:40 +; GFX7-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:44 +; GFX7-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:48 +; GFX7-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:52 +; GFX7-NEXT: buffer_load_dword v6, off, s[8:11], 0 offset:56 +; GFX7-NEXT: buffer_load_dword v7, off, s[8:11], 0 offset:60 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX7-NEXT: s_mov_b64 s[4:5], 16 -; GFX7-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v16i16_v_s: @@ -4488,22 +4592,43 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: s_lshr_b32 m0, s2, 1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_mov_b32_e32 v11, 16 -; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 -; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 s0, s2, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:32 +; GFX10-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:36 +; GFX10-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:40 +; 
GFX10-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:44 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_movrels_b32_e32 v1, v3 -; GFX10-NEXT: v_and_or_b32 v2, v1, s0, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_movreld_b32_e32 v3, v2 -; GFX10-NEXT: global_store_dwordx4 v[0:1], v[3:6], off -; GFX10-NEXT: global_store_dwordx4 v[11:12], v[7:10], off +; GFX10-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:48 +; GFX10-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:52 +; GFX10-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:56 +; GFX10-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:60 +; GFX10-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36 +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40 +; GFX10-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44 +; GFX10-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48 +; GFX10-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52 +; GFX10-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56 +; GFX10-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v10, 16 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v16i16_v_s: @@ -4511,25 +4636,25 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off ; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX11-NEXT: s_and_b32 s0, s2, 1 -; GFX11-NEXT: s_lshr_b32 m0, s2, 1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v11, 16 :: v_dual_lshlrev_b32 v0, s0, v0 -; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX11-NEXT: v_mov_b32_e32 v12, 0 -; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: s_and_b32 s0, s2, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b128 off, v[3:6], off offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_movrels_b32_e32 v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v2, v1, s0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_movreld_b32_e32 v3, v2 +; GFX11-NEXT: scratch_store_b128 off, v[7:10], off offset:48 +; GFX11-NEXT: scratch_store_b16 v0, v2, off offset:32 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off -; GFX11-NEXT: global_store_b128 v[11:12], v[7:10], off +; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:32 +; GFX11-NEXT: scratch_load_b128 v[4:7], off, off offset:48 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: 
s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr @@ -4543,44 +4668,42 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s2 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 32, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] +; GFX9-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:36 +; GFX9-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:40 +; GFX9-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:44 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:52 +; GFX9-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:56 +; GFX9-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:60 +; GFX9-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36 +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40 +; GFX9-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44 +; GFX9-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52 +; GFX9-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56 +; GFX9-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, 16 ; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: global_store_dwordx4 
v[10:11], v[4:7], off ; GFX9-NEXT: s_endpgm ; @@ -4590,93 +4713,87 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s2 +; GFX8-NEXT: v_and_b32_e32 v1, 15, v3 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 32 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX8-NEXT: v_or_b32_e32 v12, v1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] +; GFX8-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:32 +; GFX8-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:36 +; GFX8-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:40 +; GFX8-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:44 +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:48 +; GFX8-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:52 +; GFX8-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:56 +; GFX8-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:60 +; GFX8-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32 +; GFX8-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36 +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40 +; GFX8-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44 +; GFX8-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48 +; GFX8-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52 +; GFX8-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56 +; GFX8-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: s_waitcnt vmcnt(4) ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(1) ; 
GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v16i16_v_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s18, 0 -; GFX7-NEXT: s_mov_b32 s19, 0xf000 -; GFX7-NEXT: s_mov_b64 s[16:17], 0 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 -; GFX7-NEXT: s_mov_b32 s18, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s2 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[0:3], 0 addr64 offset:16 +; GFX7-NEXT: v_and_b32_e32 v1, 15, v3 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, 32 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v12, v1, v2 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] -; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 -; GFX7-NEXT: s_mov_b64 s[16:17], 16 -; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 +; GFX7-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:32 +; GFX7-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:36 +; GFX7-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:40 +; GFX7-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:44 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:48 +; GFX7-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:52 +; GFX7-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:56 +; GFX7-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:60 +; GFX7-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:32 +; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36 +; GFX7-NEXT: buffer_load_dword v2, off, 
s[4:7], 0 offset:40 +; GFX7-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44 +; GFX7-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48 +; GFX7-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52 +; GFX7-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56 +; GFX7-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: s_mov_b64 s[0:1], 16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v16i16_v_v: @@ -4684,44 +4801,43 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_mov_b32_e32 v14, 16 -; GFX10-NEXT: v_mov_b32_e32 v15, 0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 -; GFX10-NEXT: v_mov_b32_e32 v13, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 15, v3 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 32, v0 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 +; GFX10-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:32 +; GFX10-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:36 +; GFX10-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:40 +; GFX10-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:44 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5 -; GFX10-NEXT: v_and_or_b32 v16, v1, v3, v2 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v16, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v16, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v16, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v16, s2 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v16, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v16, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v16, s5 -; GFX10-NEXT: global_store_dwordx4 v[12:13], v[0:3], off -; GFX10-NEXT: global_store_dwordx4 v[14:15], v[4:7], off +; GFX10-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:48 +; GFX10-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:52 +; GFX10-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:56 +; GFX10-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:60 +; GFX10-NEXT: buffer_store_short v2, v0, s[4:7], 0 offen +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 
0 offset:32 +; GFX10-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:36 +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:40 +; GFX10-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:44 +; GFX10-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:48 +; GFX10-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:52 +; GFX10-NEXT: buffer_load_dword v6, off, s[4:7], 0 offset:56 +; GFX10-NEXT: buffer_load_dword v7, off, s[4:7], 0 offset:60 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v10, 16 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_v_v16i16_v_v: @@ -4729,50 +4845,24 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off ; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX11-NEXT: v_dual_mov_b32 v14, 16 :: v_dual_and_b32 v3, 1, v3 -; GFX11-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_and_b32 v2, 0xffff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff -; GFX11-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX11-NEXT: v_xor_b32_e32 v3, -1, v12 -; GFX11-NEXT: v_mov_b32_e32 v12, 0 -; GFX11-NEXT: v_mov_b32_e32 v13, 0 +; GFX11-NEXT: v_and_b32_e32 v0, 15, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], off offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v16, v1, v3, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v16, s6 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v16, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v16, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v16, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v16, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v10, v16, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v11, v16, s5 +; GFX11-NEXT: scratch_store_b128 off, v[8:11], off offset:48 +; GFX11-NEXT: scratch_store_b16 v0, v2, off offset:32 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[12:13], v[0:3], off -; GFX11-NEXT: global_store_b128 v[14:15], v[4:7], off +; 
GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:32 +; GFX11-NEXT: scratch_load_b128 v[4:7], off, off offset:48 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -10,18 +10,30 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s6 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_and_b32 s0, s5, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; GFX9-NEXT: buffer_store_short v2, v1, s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -30,79 +42,123 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s6 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_and_b32 s0, s5, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, 4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX8-NEXT: v_bfe_u32 v3, v0, 8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, 
v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; GFX8-NEXT: buffer_store_short v2, v1, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v2i8_s_s: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s6 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_and_b32 s0, s5, 1 +; GFX7-NEXT: v_mov_b32_e32 v1, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; GFX7-NEXT: buffer_store_short v2, v1, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v2i8_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s6 ; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_and_b32 s0, s5, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: buffer_store_dword v0, off, 
s[8:11], 0 offset:4 +; GFX10-NEXT: buffer_store_short v2, v1, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v2i8_s_s: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s0, s5, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s5, 1 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s5, 0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX11-NEXT: scratch_store_b16 v1, v2, off offset:4 +; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s0 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -117,73 +173,125 @@ ; GFX9-LABEL: insertelement_v_v2i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: s_and_b32 s0, s3, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, 4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_u32 v3, v0, 8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX8-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX8-NEXT: v_and_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v2i8_s_s: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s4 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_and_b32 s0, s3, 1 +; GFX7-NEXT: v_mov_b32_e32 v1, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; GFX7-NEXT: 
buffer_store_short v2, v1, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i8_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 0 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 s0, s3, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s1 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -191,20 +299,27 @@ ; GFX11-LABEL: insertelement_v_v2i8_s_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 1 +; GFX11-NEXT: s_and_b32 s0, s3, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX11-NEXT: scratch_store_b16 v1, v2, off offset:4 +; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, 
off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -220,17 +335,29 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s5 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_and_b32 s0, s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_add_u32_e32 v2, s0, v2 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX9-NEXT: buffer_store_short v0, v2, s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -239,36 +366,61 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: flat_load_ushort v1, v[1:2] -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s5 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_and_b32 s0, s4, 1 +; GFX8-NEXT: v_mov_b32_e32 v2, 4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX8-NEXT: buffer_store_short v0, v2, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v2i8_v_s: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; 
GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s5 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_and_b32 s0, s4, 1 +; GFX7-NEXT: v_mov_b32_e32 v2, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX7-NEXT: v_bfe_u32 v1, v1, 8, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX7-NEXT: buffer_store_short v0, v2, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -277,34 +429,53 @@ ; GFX10-LABEL: insertelement_s_v2i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s5 ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_and_b32 s0, s4, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_add_nc_u32_e64 v2, s0, 4 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX10-NEXT: buffer_store_short v0, v2, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v2i8_v_s: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 +; GFX11-NEXT: s_and_b32 s0, s4, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1 
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_and_b32 v1, 0xff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4 +; GFX11-NEXT: scratch_store_b16 v2, v0, off offset:4 +; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -324,18 +495,29 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s5 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX9-NEXT: buffer_store_short v2, v0, s[8:11], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -344,38 +526,61 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: flat_load_ushort v1, v[1:2] +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s11, 0xe80000 +; GFX8-NEXT: s_add_u32 s8, s8, s5 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX8-NEXT: v_bfe_u32 v3, v1, 8, 16 +; 
GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX8-NEXT: buffer_store_short v2, v0, s[8:11], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v2i8_s_v: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s5 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX7-NEXT: v_bfe_u32 v1, v1, 8, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 +; GFX7-NEXT: buffer_store_short v2, v0, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -384,35 +589,52 @@ ; GFX10-LABEL: insertelement_s_v2i8_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_add_u32 s8, s8, s5 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: buffer_store_dword v1, off, 
s[8:11], 0 offset:4 +; GFX10-NEXT: buffer_store_short v3, v0, s[8:11], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v2i8_s_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s4, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4 +; GFX11-NEXT: scratch_store_b16 v0, v3, off offset:4 +; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -432,16 +654,28 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_load_ushort v2, v2, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_mov_b32 s8, s4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s8 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v[0:1], v2, off ; 
GFX9-NEXT: s_endpgm @@ -451,14 +685,26 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX8-NEXT: s_mov_b32 s8, s4 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s8 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 4, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_u32 v3, v2, 8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:4 +; GFX8-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v0 +; GFX8-NEXT: v_and_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -467,20 +713,33 @@ ; ; GFX7-LABEL: insertelement_s_v2i8_v_v: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s8, s4 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s8 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:4 +; GFX7-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -489,35 +748,52 @@ ; GFX10-LABEL: insertelement_s_v2i8_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: s_mov_b32 s8, s4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 ; 
GFX10-NEXT: global_load_ushort v2, v2, s[2:3] +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s8 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX10-NEXT: v_and_b32_sdwa v3, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_short v0, v1, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: insertelement_s_v2i8_v_v: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 1, v1 ; GFX11-NEXT: global_load_u16 v2, v2, s[2:3] +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_and_b32 v1, 0xff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX11-NEXT: scratch_store_b32 off, v2, off offset:4 +; GFX11-NEXT: scratch_store_b16 v1, v0, off offset:4 +; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) @@ -535,36 +811,58 @@ ; GFX9-LABEL: insertelement_v_v2i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; 
GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s3 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 4, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX8-NEXT: v_bfe_u32 v3, v0, 8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX8-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -574,34 +872,59 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: v_mov_b32_e32 v1, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GFX7-NEXT: s_add_u32 s8, s8, s3 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: 
v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; GFX7-NEXT: buffer_store_short v3, v1, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i8_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 ; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_short v3, v1, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -609,17 +932,22 @@ ; GFX11-LABEL: insertelement_v_v2i8_s_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_and_b32 v2, 1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2 +; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX11-NEXT: scratch_store_b16 v1, v3, off offset:4 +; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 -; 
GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) @@ -637,34 +965,58 @@ ; GFX9-LABEL: insertelement_v_v2i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s2, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v1 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: s_add_u32 s4, s4, s3 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: s_and_b32 s0, s2, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, 4 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX8-NEXT: v_bfe_u32 v3, v0, 8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX8-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -674,33 +1026,57 @@ ; GFX7-NEXT: s_mov_b32 s7, 
0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s3 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_and_b32 s0, s2, 1 +; GFX7-NEXT: v_mov_b32_e32 v1, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 +; GFX7-NEXT: buffer_store_short v2, v1, s[8:11], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s3 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 s0, s2, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v1, s0, 4 +; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -708,15 +1084,21 @@ ; GFX11-LABEL: insertelement_v_v2i8_v_s: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX11-NEXT: s_and_b32 s0, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX11-NEXT: scratch_store_b16 v1, v2, off offset:4 +; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 @@ -736,70 +1118,115 @@ ; GFX9-LABEL: insertelement_v_v2i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s7, 0xe00000 +; GFX9-NEXT: s_add_u32 s4, s4, s2 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX9-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v2i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_mov_b32 s7, 0xe80000 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX8-NEXT: s_add_u32 s4, s4, s2 +; GFX8-NEXT: s_addc_u32 s5, s5, 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 4, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_u32 v3, v0, 8, 16 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX8-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, 
vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0 +; GFX8-NEXT: v_and_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v2i8_v_v: ; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 16 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX7-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX7-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i8_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; GFX10-NEXT: s_add_u32 s4, s4, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: s_addc_u32 s5, s5, 0 ; GFX10-NEXT: s_movk_i32 s0, 0xff +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 +; GFX10-NEXT: buffer_store_short v2, v1, s[4:7], 0 offen +; GFX10-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: 
v_and_b32_sdwa v3, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -807,15 +1234,20 @@ ; GFX11-LABEL: insertelement_v_v2i8_v_v: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v3 +; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX11-NEXT: scratch_store_b16 v1, v2, off offset:4 +; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir @@ -61,15 +61,15 @@ ; GFX9-LABEL: name: fmul_v2f16_fneg_lo_v_v ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32(<2 x s16>) = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-NEXT: [[FNEG:%[0-9]+]]:vgpr(s16) = G_FNEG [[TRUNC]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[FNEG]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:vgpr_32(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[COPY2]](s32) - ; GFX9-NEXT: %7:vgpr_32(<2 x s16>) = nofpexcept V_PK_MUL_F16 8, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 8, [[COPY]](<2 x s16>), 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: S_ENDPGM 0, implicit %7(<2 x s16>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX9-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY1]], implicit $exec + ; GFX9-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 65535, [[V_XOR_B32_e64_]], implicit $exec + ; GFX9-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[COPY2]], 16, [[V_AND_B32_e32_]], implicit $exec + ; GFX9-NEXT: %7:vgpr_32 = nofpexcept V_PK_MUL_F16 8, [[V_LSHL_OR_B32_e64_]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: S_ENDPGM 0, implicit %7 %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-add.mir @@ -236,22 +236,27 @@ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY6]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[ADD]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[ADD1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s16), implicit [[TRUNC1]](s16), implicit [[TRUNC2]](s16) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: S_ENDPGM 0, implicit [[TRUNC6]](s16), implicit [[TRUNC7]](s16), implicit [[TRUNC8]](s16) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ashr.mir @@ -792,10 +792,8 @@ ; GFX9PLUS-NEXT: [[ASHR:%[0-9]+]]:_(s16) = G_ASHR [[TRUNC]], [[TRUNC2]](s16) ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC 
[[UV1]](s32) ; GFX9PLUS-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[TRUNC1]], [[TRUNC3]](s16) - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16) - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[ASHR]](s16), [[ASHR1]](s16) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s32>) = COPY $vgpr0_vgpr1 %2:_(<2 x s16>) = G_ASHR %0, %1 @@ -906,31 +904,39 @@ ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) + ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9PLUS-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9PLUS-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9PLUS-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9PLUS-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9PLUS-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9PLUS-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9PLUS-NEXT: [[ASHR1:%[0-9]+]]:_(s16) = G_ASHR [[TRUNC2]], [[TRUNC5]](s16) ; GFX9PLUS-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[ASHR]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9PLUS-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9PLUS-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9PLUS-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9PLUS-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; 
GFX9PLUS-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; GFX9PLUS-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9PLUS-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9PLUS-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32) - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[BITCAST5]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR3]](s32), [[BITCAST6]](s32) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[ASHR1]](s16), [[TRUNC8]](s16) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC9]](s16), [[TRUNC10]](s16) + ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.s16.mir @@ -29,8 +29,10 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<2 x s16>) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -76,10 +78,13 @@ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = 
G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: S_NOP 0, implicit [[CONCAT_VECTORS]](<6 x s16>) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -126,9 +131,13 @@ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: S_NOP 0, implicit [[CONCAT_VECTORS]](<4 x s16>) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -193,12 +202,17 @@ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: 
[[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: S_NOP 0, implicit [[CONCAT_VECTORS]](<10 x s16>) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -280,14 +294,21 @@ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[COPY6]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY6]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[TRUNC6]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>) ; GFX9-NEXT: S_NOP 0, implicit [[CONCAT_VECTORS]](<14 x s16>) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -360,11 +381,19 @@ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = 
G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY6]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[COPY7]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: S_NOP 0, implicit [[CONCAT_VECTORS]](<8 x s16>) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -474,15 +503,31 @@ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = 
G_TRUNC [[COPY3]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY6]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[COPY7]](s32) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY8]](s32) + ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) + ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) + ; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32) + ; GFX9-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) + ; GFX9-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) + ; GFX9-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) + ; GFX9-NEXT: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[TRUNC9]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC12]](s16), [[TRUNC13]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC14]](s16), [[TRUNC15]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>) ; GFX9-NEXT: S_NOP 0, implicit [[CONCAT_VECTORS]](<16 x s16>) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir @@ -235,8 +235,7 @@ ; CHECK-LABEL: name: extract_vector_elt_0_v2i8_i32 ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) %0:_(<2 x s8>) = G_IMPLICIT_DEF %1:_(s32) = G_CONSTANT i32 0 %2:_(s8) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -252,10 +251,8 @@ ; CHECK-LABEL: name: extract_vector_elt_0_v2i16_i32 ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[DEF]](<2 x s16>) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST]](s32) %0:_(<2 x s16>) = G_IMPLICIT_DEF %1:_(s32) = G_CONSTANT i32 0 %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -272,8 +269,7 @@ ; CHECK-LABEL: name: extract_vector_elt_0_v2i1_i32 ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), 
[[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) %0:_(<2 x s1>) = G_IMPLICIT_DEF %1:_(s32) = G_CONSTANT i32 0 %2:_(s1) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -289,9 +285,17 @@ ; CHECK-LABEL: name: extract_vector_elt_0_v2i1_i1 ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[DEF]](<2 x s32>) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: G_STORE [[TRUNC]](<2 x s16>), [[FRAME_INDEX]](p5) :: (store (<2 x s16>) into %stack.0, addrspace 5) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[C]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16), addrspace 5) + ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32) %0:_(<2 x s1>) = G_IMPLICIT_DEF %1:_(s1) = G_CONSTANT i1 false %2:_(s1) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -313,9 +317,22 @@ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[LSHR]](s32) - ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<2 x s32>), [[COPY1]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s16>), [[FRAME_INDEX]](p5) :: (store (<2 x s16>) into %stack.0, addrspace 5) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND2]], [[C4]] + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16), addrspace 5) + ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -337,8 +354,7 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -363,8 +379,7 @@ ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -393,9 +408,29 @@ ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[LSHR]](s32), [[LSHR1]](s32), [[LSHR2]](s32) - ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C3]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C3]] + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C3]] + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C3]] + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s16>), [[FRAME_INDEX]](p5) :: (store (<2 x s16>) into %stack.0, align 8, addrspace 5) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s32) + ; CHECK-NEXT: G_STORE [[BITCAST1]](<2 x s16>), [[PTR_ADD]](p5) :: (store (<2 x s16>) into %stack.0 + 4, basealign 8, addrspace 5) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND4]], [[C6]] + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16), addrspace 5) + ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -417,8 +452,25 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<3 x s32>), [[COPY1]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) 
= G_BITCAST [[OR]](s32) + ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s16>), [[FRAME_INDEX]](p5) :: (store (<2 x s16>) into %stack.0, align 8, addrspace 5) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32) + ; CHECK-NEXT: G_STORE [[UV2]](s32), [[PTR_ADD]](p5) :: (store (s16) into %stack.0 + 4, align 4, basealign 8, addrspace 5) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY1]], [[C3]] + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UMIN]], [[C3]] + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16), addrspace 5) + ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(s32) = COPY $vgpr3 %2:_(<3 x s8>) = G_TRUNC %0 @@ -978,13 +1030,15 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: G_STORE [[COPY]](<2 x s16>), [[FRAME_INDEX]](p5) :: (store (<2 x s16>) into %stack.0, addrspace 5) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[SHL]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND]], [[C1]] + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16), addrspace 5) + ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -1003,10 +1057,8 @@ ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST]](s32) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 0 %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -1036,28 +1088,6 @@ $vgpr0 = COPY %3 ... ---- -name: extract_vector_elt_v2s16_idx2_i32 - -body: | - bb.0: - liveins: $vgpr0 - - ; CHECK-LABEL: name: extract_vector_elt_v2s16_idx2_i32 - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32) - %0:_(<2 x s16>) = COPY $vgpr0 - %1:_(s32) = G_CONSTANT i32 2 - %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1 - %3:_(s32) = G_ANYEXT %2 - $vgpr0 = COPY %3 -... 
- --- name: extract_vector_elt_v3s16_varidx_i32 @@ -1070,8 +1100,25 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<3 x s32>), [[COPY1]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s16>), [[FRAME_INDEX]](p5) :: (store (<2 x s16>) into %stack.0, align 8, addrspace 5) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32) + ; CHECK-NEXT: G_STORE [[UV2]](s32), [[PTR_ADD]](p5) :: (store (s16) into %stack.0 + 4, align 4, basealign 8, addrspace 5) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY1]], [[C3]] + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UMIN]], [[C3]] + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16), addrspace 5) + ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(s32) = COPY $vgpr3 %2:_(<3 x s16>) = G_TRUNC %0 @@ -1092,8 +1139,7 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(s32) = G_CONSTANT i32 0 %2:_(<3 x s16>) = G_TRUNC %0 @@ -1114,8 +1160,7 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[UV1]](s32) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(s32) = G_CONSTANT i32 1 %2:_(<3 x s16>) = G_TRUNC %0 @@ -1136,8 +1181,7 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[UV2]](s32) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(s32) = G_CONSTANT i32 2 %2:_(<3 x s16>) = G_TRUNC %0 @@ -1146,26 +1190,6 @@ $vgpr0 = COPY %4 ... 
---- -name: extract_vector_elt_v3s16_idx3_i32 - -body: | - bb.0: - liveins: $vgpr0_vgpr1_vgpr2 - - ; CHECK-LABEL: name: extract_vector_elt_v3s16_idx3_i32 - ; CHECK: liveins: $vgpr0_vgpr1_vgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = COPY [[DEF]](s32) - %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - %1:_(s32) = G_CONSTANT i32 3 - %2:_(<3 x s16>) = G_TRUNC %0 - %3:_(s16) = G_EXTRACT_VECTOR_ELT %2, %1 - %4:_(s32) = G_ANYEXT %3 - $vgpr0 = COPY %4 -... - --- name: extract_vector_elt_v4s16_varidx_i32 @@ -1178,15 +1202,19 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32) - ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<2 x s32>), [[LSHR]](s32) - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32) - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EVEC]], [[SHL]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[LSHR1]](s32) + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; CHECK-NEXT: G_STORE [[UV]](<2 x s16>), [[FRAME_INDEX]](p5) :: (store (<2 x s16>) into %stack.0, align 8, addrspace 5) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s32) + ; CHECK-NEXT: G_STORE [[UV1]](<2 x s16>), [[PTR_ADD]](p5) :: (store (<2 x s16>) into %stack.0 + 4, basealign 8, addrspace 5) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16), addrspace 5) + ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(s32) = COPY $vgpr2 %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -1917,9 +1945,23 @@ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[LSHR]](s32), [[LSHR1]](s32) - ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<3 x s32>), [[COPY1]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[EVEC]](s32) + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C2]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]] + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s16>), [[FRAME_INDEX]](p5) :: (store (<2 x s16>) into %stack.0, align 8, addrspace 5) + 
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C3]](s32) + ; CHECK-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD]](p5) :: (store (s16) into %stack.0 + 4, align 4, basealign 8, addrspace 5) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY1]], [[C4]] + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UMIN]], [[C4]] + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16), addrspace 5) + ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s24) = G_TRUNC %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fabs.mir @@ -329,21 +329,27 @@ ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF1]](s32) - ; GFX9-NEXT: [[FABS:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[FABS1:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BUILD_VECTOR_TRUNC1]] + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF1]](s16) + ; GFX9-NEXT: [[FABS:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BUILD_VECTOR]] + ; GFX9-NEXT: [[FABS1:%[0-9]+]]:_(<2 x s16>) = G_FABS [[BUILD_VECTOR1]] ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FABS]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[FABS1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[BITCAST2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; 
GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: S_NOP 0, implicit [[CONCAT_VECTORS]](<6 x s16>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FABS %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fadd.mir @@ -498,32 +498,44 @@ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<2 x s16>) = G_FADD [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(<2 x s16>) = G_FADD [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<2 x s16>) = G_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(<2 x s16>) = G_FADD [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[FADD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR 
[[BITCAST4]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[FADD1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST5]](s32), [[BITCAST6]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR3]](s32), [[BITCAST7]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[TRUNC9]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcanonicalize.mir @@ -321,19 +321,22 @@ ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF1]](s32) - ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC1]] + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), 
[[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF1]](s16) + ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR]] + ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR1]] ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FCANONICALIZE]](<2 x s16>) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[FCANONICALIZE1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR1]](s32), [[BITCAST3]](s32) - ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR1]](s32), [[BITCAST3]](s32) + ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR2]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FCANONICALIZE %0 %2:_(<3 x s32>) = G_ANYEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fceil.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fceil.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fceil.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fceil.mir @@ -227,10 +227,8 @@ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[FCEIL:%[0-9]+]]:_(s16) = G_FCEIL [[TRUNC]] ; GFX9-NEXT: [[FCEIL1:%[0-9]+]]:_(s16) = G_FCEIL [[TRUNC1]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FCEIL]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FCEIL1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FCEIL]](s16), [[FCEIL1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = G_FCEIL %0 $vgpr0 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcopysign.mir @@ -550,12 +550,12 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY]], [[BUILD_VECTOR_TRUNC1]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY1]], [[BUILD_VECTOR_TRUNC]] + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767 + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY1]], [[BUILD_VECTOR]] ; GFX9-NEXT: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[AND]], [[AND1]] ; GFX9-NEXT: $vgpr0 = COPY [[OR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcos.mir @@ -354,10 +354,8 @@ ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.cos), [[FMUL]](s16) ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[C1]] ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.cos), [[FMUL1]](s16) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT]](s16), [[INT1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = G_FCOS %0 $vgpr0 = COPY %1 @@ -567,13 +565,9 @@ ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.cos), [[FMUL2]](s16) ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC3]], [[C1]] ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.cos), [[FMUL3]](s16) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT]](s16), [[INT1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT2]](s16), [[INT3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = G_FCOS %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir @@ -1413,10 +1413,8 @@ ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s16 ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 ; GFX9-UNSAFE-NEXT: {{ $}} @@ -1435,10 
+1433,8 @@ ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[INT]] ; GFX9-UNSAFE-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC3]](s16) ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[INT1]] - ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16) - ; GFX9-UNSAFE-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL1]](s16) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL]](s16), [[FMUL1]](s16) + ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX10-LABEL: name: test_fdiv_v2s16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1465,10 +1461,8 @@ ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32) ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) - ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_FDIV %0, %1 @@ -1930,13 +1924,9 @@ ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]] ; GFX9-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32) ; GFX9-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[INT7]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; GFX9-UNSAFE-LABEL: name: test_fdiv_v4s16 ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -1970,13 +1960,9 @@ ; GFX9-UNSAFE-NEXT: [[FMUL2:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC2]], [[INT2]] ; GFX9-UNSAFE-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC7]](s16) ; GFX9-UNSAFE-NEXT: [[FMUL3:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC3]], [[INT3]] - ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16) - ; GFX9-UNSAFE-NEXT: 
[[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL1]](s16) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-UNSAFE-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL2]](s16) - ; GFX9-UNSAFE-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL3]](s16) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL]](s16), [[FMUL1]](s16) + ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL2]](s16), [[FMUL3]](s16) + ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; GFX10-LABEL: name: test_fdiv_v4s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -2026,13 +2012,9 @@ ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]] ; GFX10-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32) ; GFX10-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) - ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX10-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16) - ; GFX10-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[INT7]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp.mir @@ -271,10 +271,8 @@ ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s16) = G_FEXP2 [[FMUL]] ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL %5, [[C]] ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s16) = G_FEXP2 [[FMUL1]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FEXP2_]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FEXP2_1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FEXP2_]](s16), [[FEXP2_1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = G_FEXP %1 $vgpr0 = COPY %1 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fexp2.mir @@ -211,10 +211,8 @@ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[FEXP2_:%[0-9]+]]:_(s16) = G_FEXP2 [[TRUNC]] ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s16) = G_FEXP2 [[TRUNC1]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FEXP2_]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FEXP2_1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FEXP2_]](s16), [[FEXP2_1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = G_FEXP2 %0 $vgpr0 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ffloor.mir @@ -362,10 +362,8 @@ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[FFLOOR:%[0-9]+]]:_(s16) = G_FFLOOR [[TRUNC]] ; GFX9-NEXT: [[FFLOOR1:%[0-9]+]]:_(s16) = G_FFLOOR [[TRUNC1]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FFLOOR]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FFLOOR1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FFLOOR]](s16), [[FFLOOR1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = G_FFLOOR %0 $vgpr0 = COPY %1 @@ -534,13 +532,9 @@ ; GFX9-NEXT: [[FFLOOR1:%[0-9]+]]:_(s16) = G_FFLOOR [[TRUNC1]] ; GFX9-NEXT: [[FFLOOR2:%[0-9]+]]:_(s16) = G_FFLOOR [[TRUNC2]] ; GFX9-NEXT: [[FFLOOR3:%[0-9]+]]:_(s16) = G_FFLOOR [[TRUNC3]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FFLOOR]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FFLOOR1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FFLOOR2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[FFLOOR3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FFLOOR]](s16), [[FFLOOR1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FFLOOR2]](s16), [[FFLOOR3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = G_FFLOOR %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fma.mir @@ -570,38 +570,53 @@ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr6_vgpr7_vgpr8 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<6 x s16>) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST5]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<2 x s16>) = G_FMA [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]], [[BUILD_VECTOR_TRUNC4]] - ; GFX9-NEXT: [[FMA1:%[0-9]+]]:_(<2 x s16>) = G_FMA [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]], [[BUILD_VECTOR_TRUNC5]] + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[DEF]](s16) + ; 
GFX9-NEXT: [[FMA:%[0-9]+]]:_(<2 x s16>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR2]], [[BUILD_VECTOR4]] + ; GFX9-NEXT: [[FMA1:%[0-9]+]]:_(<2 x s16>) = G_FMA [[BUILD_VECTOR1]], [[BUILD_VECTOR3]], [[BUILD_VECTOR5]] ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[FMA]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[FMA1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32) ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV9:%[0-9]+]]:_(<2 x s16>), [[UV10:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV9]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST8]](s32) ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) ; GFX9-NEXT: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV10]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST6]](s32), [[LSHR3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST7]](s32), [[BITCAST8]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR4]](s32), [[BITCAST9]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>), [[BUILD_VECTOR_TRUNC8]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST9]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC9]](s16), [[TRUNC10]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC11]](s16), [[TRUNC12]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC13]](s16), [[TRUNC14]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>), [[BUILD_VECTOR8]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir @@ -172,10 +172,8 @@ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[FMUL]], [[TRUNC4]] ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[TRUNC3]] ; GFX10-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[FMUL1]], [[TRUNC5]] - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) - ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD]](s16), [[FADD1]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = COPY $vgpr2 @@ -364,13 +362,9 @@ ; GFX10-NEXT: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[FMUL2]], [[TRUNC10]] ; 
GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC3]], [[TRUNC7]] ; GFX10-NEXT: [[FADD3:%[0-9]+]]:_(s16) = G_FADD [[FMUL3]], [[TRUNC11]] - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) - ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX10-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD2]](s16) - ; GFX10-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD3]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD]](s16), [[FADD1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD2]](s16), [[FADD3]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -618,10 +612,8 @@ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s16) = G_FADD [[FMUL]], [[TRUNC4]] ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[TRUNC3]] ; GFX10-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[FMUL1]], [[TRUNC5]] - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) - ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD]](s16), [[FADD1]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = COPY $vgpr2 @@ -733,10 +725,8 @@ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL]], [[TRUNC4]] ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC1]], [[TRUNC3]] ; GFX10-NEXT: [[FADD1:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL1]], [[TRUNC5]] - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) - ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD]](s16), [[FADD1]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = COPY $vgpr2 @@ -929,13 +919,9 @@ ; GFX10-NEXT: [[FADD2:%[0-9]+]]:_(s16) = G_FADD [[FMUL2]], [[TRUNC10]] ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC3]], [[TRUNC7]] ; GFX10-NEXT: [[FADD3:%[0-9]+]]:_(s16) = G_FADD [[FMUL3]], [[TRUNC11]] - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) - ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX10-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD2]](s16) - ; GFX10-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD3]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x 
s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD]](s16), [[FADD1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD2]](s16), [[FADD3]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -1130,13 +1116,9 @@ ; GFX10-NEXT: [[FADD2:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL2]], [[TRUNC10]] ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s16) = nnan G_FMUL [[TRUNC3]], [[TRUNC7]] ; GFX10-NEXT: [[FADD3:%[0-9]+]]:_(s16) = nnan G_FADD [[FMUL3]], [[TRUNC11]] - ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) - ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX10-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD2]](s16) - ; GFX10-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD3]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD]](s16), [[FADD1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD2]](s16), [[FADD3]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmaxnum.mir @@ -565,36 +565,40 @@ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-NEXT: 
[[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] - ; GFX9-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC1]] - ; GFX9-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC3]] - ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] - ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM_IEEE]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC]] + ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC3]] + ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] + ; GFX9-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC1]] + ; GFX9-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC4]] + ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] + ; GFX9-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC2]] + ; GFX9-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC5]] + ; GFX9-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]] + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) + ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) - ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM_IEEE1]](<2 x s16>) - ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) - ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) - ; GFX9-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST5]](s32), [[BITCAST6]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR3]](s32), [[BITCAST7]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = 
G_BITCAST [[UV7]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMAXNUM_IEEE]](s16), [[FMAXNUM_IEEE1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMAXNUM_IEEE2]](s16), [[TRUNC6]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC7]](s16), [[TRUNC8]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 @@ -719,14 +723,39 @@ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV]] - ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV2]] - ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] - ; GFX9-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV1]] - ; GFX9-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV3]] - ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[FMAXNUM_IEEE]](<2 x s16>), [[FMAXNUM_IEEE1]](<2 x s16>) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC]] + ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC4]] + ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] + ; GFX9-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC1]] + ; GFX9-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC5]] + ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] + ; GFX9-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s16) = G_FCANONICALIZE 
[[TRUNC2]] + ; GFX9-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC6]] + ; GFX9-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]] + ; GFX9-NEXT: [[FCANONICALIZE6:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC3]] + ; GFX9-NEXT: [[FCANONICALIZE7:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC7]] + ; GFX9-NEXT: [[FMAXNUM_IEEE3:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE6]], [[FCANONICALIZE7]] + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMAXNUM_IEEE]](s16), [[FMAXNUM_IEEE1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMAXNUM_IEEE2]](s16), [[FMAXNUM_IEEE3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -1045,10 +1074,9 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[COPY]] - ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[BUILD_VECTOR_TRUNC]] + ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[BUILD_VECTOR]] ; GFX9-NEXT: $vgpr0 = COPY [[FMAXNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(s16) = G_FCONSTANT half 0xH0000 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fminnum.mir @@ -565,36 +565,40 @@ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32) - ; GFX9-NEXT: 
[[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] - ; GFX9-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC1]] - ; GFX9-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC3]] - ; GFX9-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] - ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[FMINNUM_IEEE]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC]] + ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC3]] + ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s16) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] + ; GFX9-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC1]] + ; GFX9-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC4]] + ; GFX9-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s16) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] + ; GFX9-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC2]] + ; GFX9-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC5]] + ; GFX9-NEXT: [[FMINNUM_IEEE2:%[0-9]+]]:_(s16) = G_FMINNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]] + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) + ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) - ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[FMINNUM_IEEE1]](<2 x s16>) - ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) - ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) - ; GFX9-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST5]](s32), [[BITCAST6]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR3]](s32), [[BITCAST7]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMINNUM_IEEE]](s16), [[FMINNUM_IEEE1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR 
[[FMINNUM_IEEE2]](s16), [[TRUNC6]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC7]](s16), [[TRUNC8]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 @@ -719,14 +723,39 @@ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV]] - ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV2]] - ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] - ; GFX9-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV1]] - ; GFX9-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[UV3]] - ; GFX9-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[FMINNUM_IEEE]](<2 x s16>), [[FMINNUM_IEEE1]](<2 x s16>) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC]] + ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC4]] + ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(s16) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] + ; GFX9-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC1]] + ; GFX9-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC5]] + ; GFX9-NEXT: [[FMINNUM_IEEE1:%[0-9]+]]:_(s16) = G_FMINNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] + ; GFX9-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC2]] + ; GFX9-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC6]] + ; GFX9-NEXT: [[FMINNUM_IEEE2:%[0-9]+]]:_(s16) = G_FMINNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]] + ; GFX9-NEXT: [[FCANONICALIZE6:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC3]] + ; GFX9-NEXT: 
[[FCANONICALIZE7:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC7]] + ; GFX9-NEXT: [[FMINNUM_IEEE3:%[0-9]+]]:_(s16) = G_FMINNUM_IEEE [[FCANONICALIZE6]], [[FCANONICALIZE7]] + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMINNUM_IEEE]](s16), [[FMINNUM_IEEE1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMINNUM_IEEE2]](s16), [[FMINNUM_IEEE3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -1045,10 +1074,9 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[COPY]] - ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[BUILD_VECTOR_TRUNC]] + ; GFX9-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FCANONICALIZE]], [[BUILD_VECTOR]] ; GFX9-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(s16) = G_FCONSTANT half 0xH0000 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmul.mir @@ -485,32 +485,44 @@ ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX9PLUS-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>) ; GFX9PLUS-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9PLUS-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[BITCAST1]](s32), [[DEF]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[DEF]](s32) - ; GFX9PLUS-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9PLUS-NEXT: [[FMUL1:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) + ; GFX9PLUS-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9PLUS-NEXT: [[FMUL1:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9PLUS-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[FMUL]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9PLUS-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9PLUS-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9PLUS-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[FMUL1]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; GFX9PLUS-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9PLUS-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9PLUS-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) ; GFX9PLUS-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9PLUS-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9PLUS-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST5]](s32), [[BITCAST6]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR3]](s32), [[BITCAST7]](s32) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>) + ; GFX9PLUS-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[TRUNC9]](s16) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16) + ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fneg.mir @@ -309,14 +309,17 @@ ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF 
; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF1]](s32) - ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BUILD_VECTOR_TRUNC1]] + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF1]](s16) + ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BUILD_VECTOR]] + ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(<2 x s16>) = G_FNEG [[BUILD_VECTOR1]] ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[FNEG]](<2 x s16>) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[FNEG1]](<2 x s16>) @@ -324,8 +327,8 @@ ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C1]] ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST3]], [[C1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32), [[AND2]](s32) - ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32), [[AND2]](s32) + ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR2]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_FNEG %0 %2:_(<3 x s32>) = G_ZEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fpow.mir @@ -263,10 +263,8 @@ ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FPEXT2]](s32), [[FPEXT3]](s32) ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s16) = G_FEXP2 [[FPTRUNC1]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FEXP2_]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FEXP2_1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FEXP2_]](s16), [[FEXP2_1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_FPOW %0, %1 @@ -337,10 +335,8 @@ ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan nsz G_INTRINSIC intrinsic(@llvm.amdgcn.fmul.legacy), [[FPEXT2]](s32), [[FPEXT3]](s32) ; GFX9-NEXT: 
[[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT1]](s32) ; GFX9-NEXT: [[FEXP2_1:%[0-9]+]]:_(s16) = nnan nsz G_FEXP2 [[FPTRUNC1]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FEXP2_]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FEXP2_1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FEXP2_]](s16), [[FEXP2_1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = nnan nsz G_FPOW %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir @@ -322,20 +322,17 @@ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32) - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY2]], [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32) - ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BUILD_VECTOR_TRUNC1]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY2]], [[BUILD_VECTOR]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[COPY]], [[AND]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[COPY1]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[COPY1]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR]], [[AND1]](<2 x s16>) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL]], [[LSHR1]] ; GFX9-NEXT: $vgpr0 = COPY [[OR]](<2 x s16>) @@ -843,70 +840,74 @@ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[COPY6]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[COPY7]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST5]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 - ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[C1]](s32) - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC4]], [[BUILD_VECTOR_TRUNC6]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[C2]](s32) - ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC4]], [[BUILD_VECTOR_TRUNC7]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC6]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[C3]](s32) - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[AND]](<2 x s16>) - ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC2]], 
[[BUILD_VECTOR_TRUNC8]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR4]], [[BUILD_VECTOR6]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; GFX9-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) + ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR4]], [[BUILD_VECTOR7]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR6]] + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; GFX9-NEXT: [[BUILD_VECTOR8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR]], [[AND]](<2 x s16>) + ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR2]], [[BUILD_VECTOR8]](<2 x s16>) ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR3]], [[AND1]](<2 x s16>) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL]], [[LSHR4]] - ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32) - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC5]], [[BUILD_VECTOR_TRUNC9]] - ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC5]], [[BUILD_VECTOR_TRUNC10]] - ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR_TRUNC9]] - ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[AND2]](<2 x s16>) - ; GFX9-NEXT: [[LSHR5:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC3]], [[BUILD_VECTOR_TRUNC11]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR5]], [[BUILD_VECTOR9]] + ; GFX9-NEXT: [[BUILD_VECTOR10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) + ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR5]], [[BUILD_VECTOR10]] + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR9]] + ; GFX9-NEXT: [[BUILD_VECTOR11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16) + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR1]], [[AND2]](<2 x s16>) + ; GFX9-NEXT: [[LSHR5:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR3]], [[BUILD_VECTOR11]](<2 x s16>) ; GFX9-NEXT: [[LSHR6:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR5]], [[AND3]](<2 x s16>) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR6]] ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[OR]](<2 x 
s16>) + ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) ; GFX9-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) ; GFX9-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[OR1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC9]](s16), [[TRUNC10]](s16) ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST8]](s32) ; GFX9-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32) ; GFX9-NEXT: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST6]](s32), [[LSHR7]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST7]](s32), [[BITCAST8]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR8]](s32), [[BITCAST9]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC12]](<2 x s16>) - ; GFX9-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC13]](<2 x s16>) - ; GFX9-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC14]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST9]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC13]](s16), [[TRUNC14]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC11]](s16), [[TRUNC12]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR12]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[BUILD_VECTOR14]](<2 x s16>) + ; GFX9-NEXT: $vgpr2 = COPY [[BUILD_VECTOR13]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = COPY $vgpr2 @@ -1118,36 +1119,27 @@ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32) - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV4]], [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32) - ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BUILD_VECTOR_TRUNC1]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; GFX9-NEXT: 
[[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV4]], [[BUILD_VECTOR]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV]], [[AND]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV2]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV2]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR]], [[AND1]](<2 x s16>) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL]], [[LSHR1]] - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV5]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BUILD_VECTOR_TRUNC4]] - ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV5]], [[BUILD_VECTOR3]] + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BUILD_VECTOR4]] + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR3]] + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV1]], [[AND2]](<2 x s16>) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV3]], [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV3]], [[BUILD_VECTOR5]](<2 x s16>) ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR2]], [[AND3]](<2 x s16>) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR3]] ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[OR]](<2 x s16>), [[OR1]](<2 x s16>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir @@ -353,19 +353,16 @@ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: 
[[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32) - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY2]], [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32) - ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BUILD_VECTOR_TRUNC1]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32) - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[COPY]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY2]], [[BUILD_VECTOR]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[COPY]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL]], [[AND1]](<2 x s16>) ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[COPY1]], [[AND]](<2 x s16>) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR]] @@ -993,70 +990,74 @@ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) 
; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[COPY6]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[COPY7]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST5]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 - ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[C1]](s32) - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC4]], [[BUILD_VECTOR_TRUNC6]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[C2]](s32) - ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC4]], [[BUILD_VECTOR_TRUNC7]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC6]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[C3]](s32) - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC8]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR4]], [[BUILD_VECTOR6]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; GFX9-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) + ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR4]], [[BUILD_VECTOR7]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR6]] + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; GFX9-NEXT: [[BUILD_VECTOR8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR]], [[BUILD_VECTOR8]](<2 x s16>) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL]], [[AND1]](<2 x s16>) - ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC2]], [[AND]](<2 x s16>) + ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR2]], [[AND]](<2 x s16>) ; GFX9-NEXT: 
[[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR3]] - ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32) - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR_TRUNC5]], [[BUILD_VECTOR_TRUNC9]] - ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR_TRUNC5]], [[BUILD_VECTOR_TRUNC10]] - ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR_TRUNC9]] - ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC11]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[BUILD_VECTOR5]], [[BUILD_VECTOR9]] + ; GFX9-NEXT: [[BUILD_VECTOR10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) + ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[BUILD_VECTOR5]], [[BUILD_VECTOR10]] + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR9]] + ; GFX9-NEXT: [[BUILD_VECTOR11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16) + ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR1]], [[BUILD_VECTOR11]](<2 x s16>) ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL2]], [[AND3]](<2 x s16>) - ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC3]], [[AND2]](<2 x s16>) + ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR3]], [[AND2]](<2 x s16>) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL3]], [[LSHR4]] ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[OR]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) ; GFX9-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) ; GFX9-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[OR1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC9]](s16), [[TRUNC10]](s16) ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST8]](s32) ; GFX9-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) ; GFX9-NEXT: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST6]](s32), [[LSHR5]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST7]](s32), [[BITCAST8]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[LSHR6]](s32), [[BITCAST9]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC12]](<2 x s16>) - ; GFX9-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC13]](<2 x s16>) - ; GFX9-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC14]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST9]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC13]](s16), [[TRUNC14]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC11]](s16), [[TRUNC12]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR12]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[BUILD_VECTOR14]](<2 x s16>) + ; GFX9-NEXT: $vgpr2 = COPY [[BUILD_VECTOR13]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = COPY $vgpr2 @@ -1405,35 +1406,26 @@ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32) - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV4]], [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32) - ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BUILD_VECTOR_TRUNC1]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32) - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV4]], [[BUILD_VECTOR]] + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR]] + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL]], [[AND1]](<2 x s16>) ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV2]], [[AND]](<2 x s16>) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR]] - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV5]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; 
GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BUILD_VECTOR_TRUNC4]] - ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR_TRUNC3]] - ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV1]], [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV5]], [[BUILD_VECTOR3]] + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BUILD_VECTOR4]] + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR3]] + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) + ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV1]], [[BUILD_VECTOR5]](<2 x s16>) ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL2]], [[AND3]](<2 x s16>) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV3]], [[AND2]](<2 x s16>) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL3]], [[LSHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsin.mir @@ -354,10 +354,8 @@ ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), [[FMUL]](s16) ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[C1]] ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), [[FMUL1]](s16) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT]](s16), [[INT1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = G_FSIN %0 $vgpr0 = COPY %1 @@ -567,13 +565,9 @@ ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), [[FMUL2]](s16) ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC3]], [[C1]] ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.sin), [[FMUL3]](s16) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x 
s16>) = G_BUILD_VECTOR [[INT]](s16), [[INT1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT2]](s16), [[INT3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = G_FSIN %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsqrt.mir @@ -274,10 +274,8 @@ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[FSQRT:%[0-9]+]]:_(s16) = G_FSQRT [[TRUNC]] ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s16) = G_FSQRT [[TRUNC1]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSQRT]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FSQRT1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSQRT]](s16), [[FSQRT1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = G_FSQRT %0 $vgpr0 = COPY %1 @@ -446,13 +444,9 @@ ; GFX9-NEXT: [[FSQRT1:%[0-9]+]]:_(s16) = G_FSQRT [[TRUNC1]] ; GFX9-NEXT: [[FSQRT2:%[0-9]+]]:_(s16) = G_FSQRT [[TRUNC2]] ; GFX9-NEXT: [[FSQRT3:%[0-9]+]]:_(s16) = G_FSQRT [[TRUNC3]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSQRT]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FSQRT1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FSQRT2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[FSQRT3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSQRT]](s16), [[FSQRT1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSQRT2]](s16), [[FSQRT3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = G_FSQRT %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fsub.mir @@ -568,15 +568,15 @@ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) 
= G_ANYEXT [[FSUB]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FSUB1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FSUB2]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[BITCAST4]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[BITCAST5]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSUB]](s16), [[FSUB1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSUB2]](s16), [[TRUNC6]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC7]](s16), [[TRUNC8]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 @@ -719,13 +719,9 @@ ; GFX9-NEXT: [[FSUB1:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC1]], [[TRUNC5]] ; GFX9-NEXT: [[FSUB2:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC2]], [[TRUNC6]] ; GFX9-NEXT: [[FSUB3:%[0-9]+]]:_(s16) = G_FSUB [[TRUNC3]], [[TRUNC7]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSUB]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FSUB1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FSUB2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[FSUB3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSUB]](s16), [[FSUB1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSUB2]](s16), [[FSUB3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir @@ -166,7 +166,16 @@ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<2 x s32>) - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[UV1]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]] + ; 
CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST]](<2 x s16>) + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[BITCAST1]](s32), [[LSHR]](s32) ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(s32) = COPY $vgpr0 %1:_(s8) = G_TRUNC %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-round.mir @@ -641,10 +641,8 @@ ; GFX9-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS1]](s16), [[C2]] ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[FCMP1]](s1), [[OR1]], [[C1]] ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC1]], [[SELECT1]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD]](s16), [[FADD1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = G_INTRINSIC_ROUND %0 $vgpr0 = COPY %1 @@ -857,15 +855,15 @@ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD2]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[BITCAST2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD]](s16), [[FADD1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; 
GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 @@ -1099,13 +1097,9 @@ ; GFX9-NEXT: [[FCMP3:%[0-9]+]]:_(s1) = G_FCMP floatpred(oge), [[FABS3]](s16), [[C2]] ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s16) = G_SELECT [[FCMP3]](s1), [[OR3]], [[C1]] ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s16) = G_FADD [[INTRINSIC_TRUNC3]], [[SELECT3]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD]](s16), [[FADD1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FADD2]](s16), [[FADD3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = G_INTRINSIC_ROUND %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-trunc.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-trunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-trunc.mir @@ -220,10 +220,8 @@ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[INTRINSIC_TRUNC:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC]] ; GFX9-NEXT: [[INTRINSIC_TRUNC1:%[0-9]+]]:_(s16) = G_INTRINSIC_TRUNC [[TRUNC1]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INTRINSIC_TRUNC]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INTRINSIC_TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INTRINSIC_TRUNC]](s16), [[INTRINSIC_TRUNC1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = G_INTRINSIC_TRUNC %0 $vgpr0 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: not --crash llc -march=amdgcn -run-pass=legalizer -o /dev/null %s 2>&1 | FileCheck %s # CHECK: LLVM ERROR: unable to legalize instruction: %3:_(p0) = G_JUMP_TABLE %jump-table.0 (in function: jt_test) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll @@ -609,9 +609,11 @@ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2d), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX10NSA-LABEL: name: atomic_add_2d @@ -629,9 +631,11 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2d), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2d), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") ; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 main_body: @@ -656,12 +660,15 @@ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) ; 
GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -680,12 +687,15 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[DEF]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.3d), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") ; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -711,12 +721,15 @@ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; 
GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -735,12 +748,15 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[DEF]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.cube), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") ; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -766,9 +782,11 @@ ; 
GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1darray), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1darray), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX10NSA-LABEL: name: atomic_add_1darray @@ -786,9 +804,11 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1darray), [[COPY8]](s32), [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1darray), [[COPY8]](s32), [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") ; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 main_body: @@ -813,12 +833,15 @@ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-NEXT: 
[[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -837,12 +860,15 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) - ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[DEF]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darray), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource") ; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -868,12 +894,15 @@ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; 
GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
 ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -892,12 +921,15 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
- ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[DEF]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2dmsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -923,12 +955,16 @@
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
 ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -947,12 +983,16 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
 ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32)
 ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.2darraymsaa), [[COPY8]](s32), [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -1024,10 +1064,12 @@
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32)
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
 ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: atomic_cmpswap_2d
@@ -1046,10 +1088,12 @@
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32)
 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -1075,13 +1119,16 @@
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
 ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -1101,13 +1148,16 @@
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32)
 ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.3d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -1134,13 +1184,17 @@
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
 ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -1160,13 +1214,17 @@
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
 ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY11]](s32)
 ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.2darraymsaa), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store (s32) on custom "ImageResource")
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
@@ -18,11 +18,10 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -43,11 +42,10 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -75,14 +73,16 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY9]](s16), [[COPY10]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -103,14 +103,16 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY9]](s16), [[COPY10]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -140,18 +142,21 @@
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -174,18 +179,21 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -217,18 +225,21 @@
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -251,18 +262,21 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -293,14 +307,16 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY9]](s16), [[COPY10]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -321,14 +337,16 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY9]](s16), [[COPY10]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -358,18 +376,21 @@
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -392,18 +413,21 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -435,18 +459,21 @@
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -469,18 +496,21 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2dmsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -512,19 +542,24 @@
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[COPY13]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -547,19 +582,24 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
 ; GFX10NSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[COPY13]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -591,14 +631,16 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY9]](s16), [[COPY10]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -619,14 +661,16 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY9]](s16), [[COPY10]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -656,18 +700,21 @@
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -690,18 +737,21 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -733,19 +783,24 @@
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[COPY13]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4
x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -768,19 +823,24 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) ; GFX10NSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[COPY13]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -813,19 +873,24 @@ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 
[[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[COPY13]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -848,19 +913,24 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: 
[[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) ; GFX10NSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[COPY13]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -893,18 +963,21 @@ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9-NEXT: 
[[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -927,18 +1000,21 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16) + ; 
GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -970,19 +1046,24 @@ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[COPY13]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) 
from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1005,19 +1086,24 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) ; GFX10NSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[COPY13]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1054,11 +1140,10 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), 
[[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY13]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX9-NEXT: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d ; GFX10NSA: bb.1.main_body: @@ -1079,11 +1164,10 @@ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY13]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX10NSA-NEXT: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1111,14 +1195,16 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX9-NEXT: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY13]](s16), [[COPY14]](s16) + ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX9-NEXT: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_2d ; GFX10NSA: bb.1.main_body: @@ -1139,14 +1225,16 @@ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY13]](s16), [[COPY14]](s16) + ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX10NSA-NEXT: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1176,18 +1264,21 @@ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; 
GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX9-NEXT: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_3d @@ -1210,18 +1301,21 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16) + ; GFX10NSA-NEXT: 
[[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[DEF]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX10NSA-NEXT: S_ENDPGM 0 main_body: @@ -1253,18 +1347,21 @@ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX9-NEXT: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_cube @@ -1287,18 +1384,21 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16) + ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[DEF]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX10NSA-NEXT: S_ENDPGM 0 main_body: @@ -1329,14 +1429,16 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: 
[[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY13]](s16), [[COPY14]](s16) + ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX9-NEXT: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1darray ; GFX10NSA: bb.1.main_body: @@ -1357,14 +1459,16 @@ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY13]](s16), [[COPY14]](s16) + ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX10NSA-NEXT: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -1394,18 +1498,21 @@ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) 
; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX9-NEXT: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_2darray @@ -1428,18 +1535,21 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16) + ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR 
[[COPY16]](s16), [[DEF]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX10NSA-NEXT: S_ENDPGM 0 main_body: @@ -1471,18 +1581,21 @@ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2dmsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX9-NEXT: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_2dmsaa @@ -1505,18 +1618,21 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) 
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[DEF]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2dmsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX10NSA-NEXT: S_ENDPGM 0
 main_body:
@@ -1548,19 +1664,24 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[COPY17]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX9-NEXT: S_ENDPGM 0
 ; GFX10NSA-LABEL: name: store_2darraymsaa
@@ -1583,19 +1704,24 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
 ; GFX10NSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX10NSA-NEXT: [[COPY17:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[COPY17]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX10NSA-NEXT: S_ENDPGM 0
 main_body:
@@ -1627,14 +1753,16 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY13]](s16), [[COPY14]](s16)
+ ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX9-NEXT: S_ENDPGM 0
 ; GFX10NSA-LABEL: name: store_mip_1d
 ; GFX10NSA: bb.1.main_body:
@@ -1655,14 +1783,16 @@
 ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY13]](s16), [[COPY14]](s16)
+ ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX10NSA-NEXT: S_ENDPGM 0
 main_body:
 %s = extractelement <2 x i16> %coords, i32 0
@@ -1692,18 +1822,21 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX9-NEXT: S_ENDPGM 0
 ; GFX10NSA-LABEL: name: store_mip_2d
@@ -1726,18 +1859,21 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[DEF]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX10NSA-NEXT: S_ENDPGM 0
 main_body:
@@ -1769,19 +1905,24 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[COPY17]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX9-NEXT: S_ENDPGM 0
 ; GFX10NSA-LABEL: name: store_mip_3d
@@ -1804,19 +1945,24 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
 ; GFX10NSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX10NSA-NEXT: [[COPY17:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[COPY17]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX10NSA-NEXT: S_ENDPGM 0
 main_body:
@@ -1849,19 +1995,24 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[COPY17]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX9-NEXT: S_ENDPGM 0
 ; GFX10NSA-LABEL: name: store_mip_cube
@@ -1884,19 +2035,24 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
 ; GFX10NSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX10NSA-NEXT: [[COPY17:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[COPY17]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX10NSA-NEXT: S_ENDPGM 0
 main_body:
@@ -1929,18 +2085,21 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX9-NEXT: S_ENDPGM 0
 ; GFX10NSA-LABEL: name: store_mip_1darray
@@ -1963,18 +2122,21 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[DEF]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX10NSA-NEXT: S_ENDPGM 0
 main_body:
@@ -2006,19 +2168,24 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[COPY17]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX9-NEXT: S_ENDPGM 0
 ; GFX10NSA-LABEL: name: store_mip_2darray
@@ -2041,19 +2208,24 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
 ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX10NSA-NEXT: [[COPY14:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
- ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+ ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10NSA-NEXT: [[COPY15:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
 ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX10NSA-NEXT: [[COPY16:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
 ; GFX10NSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32)
- ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX10NSA-NEXT: [[COPY17:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY14]](s16), [[COPY15]](s16)
+ ; GFX10NSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY16]](s16), [[COPY17]](s16)
+ ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource")
 ; GFX10NSA-NEXT: S_ENDPGM 0
 main_body:
@@ -2080,11 +2252,10 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2105,11 +2276,10 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2137,11 +2307,10 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2162,11 +2331,10 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2194,11 +2362,10 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2219,11 +2386,10 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2251,11 +2417,10 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2276,11 +2441,10 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2308,11 +2472,10 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2333,11 +2496,10 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2365,11 +2527,10 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2390,11 +2551,10 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2422,11 +2582,10 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2447,11 +2606,10 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2479,11 +2637,10 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2504,11 +2661,10 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2536,11 +2692,10 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (s32) from custom "ImageResource")
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (s32) from custom "ImageResource")
 ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
 ; GFX10NSA-LABEL: name: load_1d_V1
@@ -2557,11 +2712,10 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (s32) from custom "ImageResource")
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (s32) from custom "ImageResource")
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10NSA-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
 main_body:
@@ -2585,11 +2739,10 @@
 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2608,11 +2761,10 @@
 ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
 ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
 ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2639,11 +2791,10 @@
 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (s32) into custom "ImageResource")
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[COPY10]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (s32) into custom "ImageResource")
 ; GFX9-NEXT: S_ENDPGM 0
 ; GFX10NSA-LABEL: name: store_1d_V1
 ; GFX10NSA: bb.1.main_body:
@@ -2660,11 +2811,10 @@
 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (s32) into custom "ImageResource")
+ ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[COPY10]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (s32) into custom "ImageResource")
 ; GFX10NSA-NEXT: S_ENDPGM 0
 main_body:
 %s = extractelement <2 x i16> %coords, i32 0
@@ -2690,11 +2840,10 @@
 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
 ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<2 x s32>) into custom "ImageResource")
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[COPY11]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<2 x s32>) into custom "ImageResource")
 ; GFX9-NEXT: S_ENDPGM 0
 ; GFX10NSA-LABEL: name: store_1d_V2
 ; GFX10NSA: bb.1.main_body:
@@ -2713,11 +2862,10 @@
 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
 ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
- ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
- ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>),
0, 0, 3 :: (dereferenceable store (<2 x s32>) into custom "ImageResource") + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[COPY11]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store (<2 x s32>) into custom "ImageResource") ; GFX10NSA-NEXT: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -2740,11 +2888,10 @@ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -2765,11 +2912,10 @@ ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ 
-2797,11 +2943,10 @@ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -2822,11 +2967,10 @@ ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -2854,11 +2998,10 @@ ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -2879,11 +3022,10 @@ ; GFX10NSA-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10NSA-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -2916,11 +3058,10 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY13]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: 
(dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX9-NEXT: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_glc ; GFX10NSA: bb.1.main_body: @@ -2941,11 +3082,10 @@ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY13]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX10NSA-NEXT: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -2973,11 +3113,10 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY13]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX9-NEXT: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_slc ; GFX10NSA: bb.1.main_body: @@ -2998,11 +3137,10 @@ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: 
[[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY13]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX10NSA-NEXT: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -3030,11 +3168,10 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY13]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX9-NEXT: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_glc_slc ; GFX10NSA: bb.1.main_body: @@ -3055,11 +3192,10 @@ ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX10NSA-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[COPY13]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store (<4 x s32>) into custom "ImageResource") ; GFX10NSA-NEXT: S_ENDPGM 0 main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -3112,11 +3248,10 @@ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = 
G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3139,11 +3274,10 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3176,14 +3310,16 @@ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), 
[[LSHR1]](s32) - ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY9]](s16), [[COPY10]](s16) + ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3206,14 +3342,16 @@ ; GFX10NSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY9]](s16), [[COPY10]](s16) + ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) ; GFX10NSA-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -3248,18 
+3386,21 @@ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF1]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16) + ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF1]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) @@ -3284,18 +3425,21 @@ ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR 
[[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF1]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16) + ; GFX10NSA-NEXT: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[DEF1]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) @@ -3332,19 +3476,24 @@ ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32) - ; 
GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY12]](s16), [[COPY13]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX9-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) @@ -3369,19 +3518,24 @@ ; GFX10NSA-NEXT: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA-NEXT: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA-NEXT: [[COPY10:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) ; GFX10NSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10NSA-NEXT: [[COPY11:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) ; GFX10NSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10NSA-NEXT: [[COPY12:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16) ; GFX10NSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX10NSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[LSHR3]](s32) - ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10NSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX10NSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10NSA-NEXT: [[COPY13:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY10]](s16), [[COPY11]](s16) + ; GFX10NSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = 
G_BUILD_VECTOR [[COPY12]](s16), [[COPY13]](s16) + ; GFX10NSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10NSA-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10NSA-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX10NSA-NEXT: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll @@ -83,9 +83,11 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -111,9 +113,11 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC 
[[COPY13]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -145,12 +149,15 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -177,12 +184,15 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-NEXT: 
[[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -215,12 +225,15 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -247,12 +260,15 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) 
= G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -285,9 +301,11 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -313,9 +331,11 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 
[[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1darray), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -347,12 +367,15 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -379,12 +402,15 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -418,10 +444,11 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -449,10 +476,11 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -486,10 +514,12 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -517,10 +547,12 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -553,9 +585,11 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -581,9 +615,11 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -615,12 +651,15 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -647,12 +686,15 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cl.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -686,10 +728,12 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -717,10 +761,12 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -754,13 +800,16 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -788,13 +837,16 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -826,11 +878,13 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -857,11 +911,13 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -894,12 +950,15 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -926,12 +985,15 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -964,13 +1026,15 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -997,13 +1061,15 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1035,14 +1101,17 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -1069,14 +1138,17 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1108,12 +1180,15 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -1140,12 +1215,15 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -1178,14 +1256,18 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -1212,14 +1294,18 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1251,14 +1337,17 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -1285,14 +1374,17 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1324,16 +1416,20 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -1360,16 +1456,20 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.b.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1401,13 +1501,16 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -1434,13 +1537,16 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1472,15 +1578,21 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -1507,15 +1619,21 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1547,22 +1665,31 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
 ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32)
 ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x
s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY20]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1589,22 +1716,31 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32) ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32) ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32) ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), 
[[COPY19]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY20]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1638,14 +1774,17 @@ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = 
G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1673,14 +1812,17 @@ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -1713,16 +1855,22 @@ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC 
[[COPY13]](s32) ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32) ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1750,16 +1898,22 @@ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32) ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - 
; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -1791,14 +1945,18 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 
[[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1825,14 +1983,18 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -1864,18 +2026,25 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; 
GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32) ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1902,18 +2071,25 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32) ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = 
G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -1946,15 +2122,19 @@ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x 
s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -1982,15 +2162,19 @@ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -2023,19 +2207,26 @@ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), 
[[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32) ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32) ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<10 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -2063,19 +2254,26 @@ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = 
G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32) ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32) ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -2107,13 +2305,16 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX9-NEXT: 
[[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -2140,13 +2341,16 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), 
[[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -2178,15 +2382,21 @@ ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>) ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -2213,15 +2423,21 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; 
GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -2254,14 +2470,17 @@ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), 
[[BUILD_VECTOR4]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -2289,14 +2508,17 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2329,16 +2551,22 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -2366,16 +2594,22 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2407,14 +2641,18 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -2441,14 +2679,18 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2480,18 +2722,25 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -2518,18 +2767,25 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2562,15 +2818,19 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[CONCAT_VECTORS]](<8 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -2598,15 +2858,19 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2639,19 +2903,26 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
 ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<10 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -2679,19 +2950,26 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
 ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[COPY18]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2723,9 +3001,11 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2751,9 +3031,11 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -2785,12 +3067,15 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -2817,12 +3102,15 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.l.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -2856,10 +3144,12 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -2887,10 +3177,12 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -2924,13 +3216,16 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -2958,13 +3253,16 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.l.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -3056,9 +3354,11 @@
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -3084,9 +3384,11 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -3119,10 +3421,11 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -3150,10 +3453,11 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.1d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -3187,10 +3491,12 @@
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -3218,10 +3524,12 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.lz.2d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -3256,20 +3564,27 @@
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
 ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32)
 ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY20]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (s32) from custom "ImageResource")
 ; GFX9-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -3294,20 +3609,27 @@
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
 ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32)
 ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY20]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (s32) from custom "ImageResource")
 ; GFX10-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -3338,20 +3660,27 @@
 ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
 ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32)
 ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY20]](s32)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
 ; GFX9-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
 ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -3378,20 +3707,27 @@
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
+ ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY18]](s32)
 ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
+ ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY19]](s32)
 ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
+ ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY20]](s32)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
@@ -22,13 +22,15 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -54,13 +56,15 @@
 ; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
+ ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
- ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -92,16 +96,20 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -127,16 +135,20 @@
 ; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ;
GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32) ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) - ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -168,23 +180,29 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32) ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX10-NEXT: 
[[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), [[DEF]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32) ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -211,23 +229,29 @@ ; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32) ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY17]](s32), 
[[DEF]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32) ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32) - ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) + ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.3d), 15, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -261,14 +285,16 @@ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD 
intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -295,14 +321,16 @@ ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) + ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) - ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -335,17 +363,21 @@ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 ; GFX10-NEXT: 
[[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -372,17 +404,21 @@ ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) - ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), 
[[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -414,15 +450,17 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -448,15 +486,17 @@ ; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY 
$vgpr1 + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) + ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32) ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) - ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -488,18 +528,22 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD 
intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -525,18 +569,22 @@ ; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32) ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) - ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.d.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -569,16 +617,18 @@ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -605,16 +655,18 @@ ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32) + ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX11-NEXT: 
[[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32) - ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -647,19 +699,23 @@ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) ; GFX10-NEXT: 
[[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -687,19 +743,23 @@ ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32) ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6 ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32) ; GFX11-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32) - ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) + ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) @@ -732,13 +792,15 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) 
; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -764,13 +826,15 @@ ; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32) + ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16) ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32) - ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 
x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -802,16 +866,20 @@ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32) - ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") + ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource") ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) @@ -837,16 +905,20 @@ ; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32) ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32) ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32) ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32) ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4 
 ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
- ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -879,14 +951,16 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -913,14 +987,16 @@
 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
+ ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
- ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -953,17 +1029,21 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
 ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -990,17 +1070,21 @@
 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
 ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
 ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
- ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.2d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1032,15 +1116,17 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1066,15 +1152,17 @@
 ; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
 ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
+ ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY14]](s32)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
- ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.1d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1106,18 +1194,22 @@
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
 ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1143,18 +1235,22 @@
 ; GFX11-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
 ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
 ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
- ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.cd.cl.2d), 15, [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1187,16 +1283,18 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
 ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
- ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1223,16 +1321,18 @@
 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF]](s32)
+ ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[DEF]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[DEF]](s16)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32)
 ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY16]](s32)
- ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
+ ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.1d), 15, [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
 ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32)
@@ -1265,19 +1365,23 @@
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
 ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
 ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
 ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -1305,19 +1409,23 @@
 ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
 ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY13]](s32)
 ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
 ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
 ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY17]](s32)
 ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
 ; GFX11-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
- ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
+ ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<12 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>)
 ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.cd.cl.2d), 15, [[CONCAT_VECTORS]](<12 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<4 x s32>) from custom "ImageResource")
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
 ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -1352,20 +1460,24 @@
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
 ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
 ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
 ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
 ; GFX10-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32) from custom "ImageResource")
 ; GFX10-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -1390,20 +1502,24 @@
 ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
 ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
 ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
 ; GFX11-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
 ; GFX11-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
- ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
+ ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
 ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32) from custom "ImageResource")
 ; GFX11-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
 ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -1434,20 +1550,24 @@
 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
 ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
 ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
 ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
 ; GFX10-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
 ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
@@ -1474,20 +1594,24 @@
 ; GFX11-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; GFX11-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
 ; GFX11-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
 ; GFX11-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY16]](s32)
 ; GFX11-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY17]](s32)
 ; GFX11-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
 ; GFX11-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
 ; GFX11-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32)
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY18]](s32)
 ; GFX11-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY19]](s32)
 ; GFX11-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY20]](s32)
- ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
+ ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<14 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
 ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 6, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (<2 x s32>) from custom "ImageResource")
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
 ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
@@ -261,14 +261,17 @@
 ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
 ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>) into custom "ImageResource", align 8)
 ; GFX9-NEXT: S_ENDPGM 0
 ; GFX10-LABEL: name: image_store_v3f16
@@ -289,14 +292,17 @@
 ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
 ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY11]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX10-NEXT: G_AMDGPU_INTRIN_IMAGE_STORE_D16 intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store (<3 x s16>) into custom "ImageResource", align 8)
 ; GFX10-NEXT: S_ENDPGM 0
 call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir
@@ -3290,32 +3290,48 @@
 ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32)
 ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
 ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV]](s32), [[LSHR]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>)
 ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
 ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32)
 ; GFX9-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV1]](s32), [[LSHR3]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR4]](s32), [[LSHR5]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS1]](<4 x s16>)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
+ ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
+ ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[TRUNC6]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC7]](s16), [[TRUNC8]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS1]](<4 x s16>)
 ; GFX9-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32)
 ; GFX9-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32)
 ; GFX9-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV2]](s32), [[LSHR6]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR7]](s32), [[LSHR8]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS2]](<4 x s16>)
+ ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32)
+ ; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32)
+ ; GFX9-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+ ; GFX9-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC12]](s16), [[TRUNC13]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC14:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS2]](<4 x s16>)
 ; GFX9-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32)
 ; GFX9-NEXT: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C1]](s32)
 ; GFX9-NEXT: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV3]](s32), [[LSHR9]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR10]](s32), [[LSHR11]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS3]](<4 x s16>)
- ; GFX9-NEXT: [[CONCAT_VECTORS4:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s8>), [[TRUNC1]](<4 x s8>), [[TRUNC2]](<4 x s8>), [[TRUNC3]](<4 x s8>)
+ ; GFX9-NEXT: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[UV3]](s32)
+ ; GFX9-NEXT: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
+ ; GFX9-NEXT: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR10]](s32)
+ ; GFX9-NEXT: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC15]](s16), [[TRUNC16]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC17]](s16), [[TRUNC18]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC19:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS3]](<4 x s16>)
+ ; GFX9-NEXT: [[CONCAT_VECTORS4:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS [[TRUNC4]](<4 x s8>), [[TRUNC9]](<4 x s8>), [[TRUNC14]](<4 x s8>), [[TRUNC19]](<4 x s8>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS4]](<16 x s8>)
 %0:_(p4) = COPY $vgpr0_vgpr1
 %1:_(<16 x s8>) = G_LOAD %0 :: (load (<16 x s8>), align 16, addrspace 4)
@@ -3442,60 +3458,92 @@
 ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32)
 ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
 ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV]](s32), [[LSHR]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>)
 ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
 ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32)
 ; GFX9-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV1]](s32), [[LSHR3]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR4]](s32), [[LSHR5]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS1]](<4 x s16>)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
+ ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
+ ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[TRUNC6]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC7]](s16), [[TRUNC8]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS1]](<4 x s16>)
 ; GFX9-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32)
 ; GFX9-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32)
 ; GFX9-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV2]](s32), [[LSHR6]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR7]](s32), [[LSHR8]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS2]](<4 x s16>)
+ ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32)
+ ; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32)
+ ; GFX9-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32)
+ ; GFX9-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC12]](s16), [[TRUNC13]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC14:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS2]](<4 x s16>)
 ; GFX9-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32)
 ; GFX9-NEXT: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C1]](s32)
 ; GFX9-NEXT: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV3]](s32), [[LSHR9]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR10]](s32), [[LSHR11]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS3]](<4 x s16>)
+ ; GFX9-NEXT: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[UV3]](s32)
+ ; GFX9-NEXT: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32)
+ ; GFX9-NEXT: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR10]](s32)
+ ; GFX9-NEXT: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC15]](s16), [[TRUNC16]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC17]](s16), [[TRUNC18]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC19:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS3]](<4 x s16>)
 ; GFX9-NEXT: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C]](s32)
 ; GFX9-NEXT: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C1]](s32)
 ; GFX9-NEXT: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[UV4]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV4]](s32), [[LSHR12]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR13]](s32), [[LSHR14]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS4:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC8]](<2 x s16>), [[BUILD_VECTOR_TRUNC9]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS4]](<4 x s16>)
+ ; GFX9-NEXT: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[UV4]](s32)
+ ; GFX9-NEXT: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32)
+ ; GFX9-NEXT: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR13]](s32)
+ ; GFX9-NEXT: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR14]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC20]](s16), [[TRUNC21]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC22]](s16), [[TRUNC23]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS4:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR8]](<2 x s16>), [[BUILD_VECTOR9]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC24:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS4]](<4 x s16>)
 ; GFX9-NEXT: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32)
 ; GFX9-NEXT: [[LSHR16:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C1]](s32)
 ; GFX9-NEXT: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV5]](s32), [[LSHR15]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR16]](s32), [[LSHR17]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS5:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC10]](<2 x s16>), [[BUILD_VECTOR_TRUNC11]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS5]](<4 x s16>)
+ ; GFX9-NEXT: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[UV5]](s32)
+ ; GFX9-NEXT: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR15]](s32)
+ ; GFX9-NEXT: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR16]](s32)
+ ; GFX9-NEXT: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR17]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC25]](s16), [[TRUNC26]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC27]](s16), [[TRUNC28]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS5:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR10]](<2 x s16>), [[BUILD_VECTOR11]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC29:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS5]](<4 x s16>)
 ; GFX9-NEXT: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[UV6]], [[C]](s32)
 ; GFX9-NEXT: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[UV6]], [[C1]](s32)
 ; GFX9-NEXT: [[LSHR20:%[0-9]+]]:_(s32) = G_LSHR [[UV6]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV6]](s32), [[LSHR18]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR19]](s32), [[LSHR20]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS6:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC12]](<2 x s16>), [[BUILD_VECTOR_TRUNC13]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS6]](<4 x s16>)
+ ; GFX9-NEXT: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[UV6]](s32)
+ ; GFX9-NEXT: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR18]](s32)
+ ; GFX9-NEXT: [[TRUNC32:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32)
+ ; GFX9-NEXT: [[TRUNC33:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR20]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC30]](s16), [[TRUNC31]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC32]](s16), [[TRUNC33]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS6:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR12]](<2 x s16>), [[BUILD_VECTOR13]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC34:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS6]](<4 x s16>)
 ; GFX9-NEXT: [[LSHR21:%[0-9]+]]:_(s32) = G_LSHR [[UV7]], [[C]](s32)
 ; GFX9-NEXT: [[LSHR22:%[0-9]+]]:_(s32) = G_LSHR [[UV7]], [[C1]](s32)
 ; GFX9-NEXT: [[LSHR23:%[0-9]+]]:_(s32) = G_LSHR [[UV7]], [[C2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UV7]](s32), [[LSHR21]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC15:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR22]](s32), [[LSHR23]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS7:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC14]](<2 x s16>), [[BUILD_VECTOR_TRUNC15]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS7]](<4 x s16>)
- ; GFX9-NEXT: [[CONCAT_VECTORS8:%[0-9]+]]:_(<32 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s8>), [[TRUNC1]](<4 x s8>), [[TRUNC2]](<4 x s8>), [[TRUNC3]](<4 x s8>), [[TRUNC4]](<4 x s8>), [[TRUNC5]](<4 x s8>), [[TRUNC6]](<4 x s8>), [[TRUNC7]](<4 x s8>)
+ ; GFX9-NEXT: [[TRUNC35:%[0-9]+]]:_(s16) = G_TRUNC [[UV7]](s32)
+ ; GFX9-NEXT: [[TRUNC36:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR21]](s32)
+ ; GFX9-NEXT: [[TRUNC37:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR22]](s32)
+ ; GFX9-NEXT: [[TRUNC38:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR23]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC35]](s16), [[TRUNC36]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR15:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC37]](s16), [[TRUNC38]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS7:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR14]](<2 x s16>), [[BUILD_VECTOR15]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC39:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS7]](<4 x s16>)
+ ; GFX9-NEXT: [[CONCAT_VECTORS8:%[0-9]+]]:_(<32 x s8>) = G_CONCAT_VECTORS [[TRUNC4]](<4 x s8>), [[TRUNC9]](<4 x s8>), [[TRUNC14]](<4 x s8>), [[TRUNC19]](<4 x s8>), [[TRUNC24]](<4 x s8>), [[TRUNC29]](<4 x s8>), [[TRUNC34]](<4 x s8>), [[TRUNC39]](<4 x s8>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS8]](<32 x s8>)
 %0:_(p4) = COPY $vgpr0_vgpr1
 %1:_(<32 x s8>) = G_LOAD %0 :: (load (<32 x s8>), align 32, addrspace 4)
@@ -3575,11 +3623,13 @@
 ; GFX9-NEXT: {{ $}}
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1
 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load (s16), addrspace 4)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
 ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s16) from unknown-address + 2, addrspace 4)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 %0:_(p4) = COPY $vgpr0_vgpr1
 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 4)
 $vgpr0 = COPY %1
@@ -3654,6 +3704,7 @@
 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
 ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
 ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64)
 ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p4) :: (load (s8) from unknown-address + 2, addrspace 4)
@@ -3661,8 +3712,9 @@
 ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load (s8) from unknown-address + 3, addrspace 4)
 ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
 ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32)
- ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 %0:_(p4) = COPY $vgpr0_vgpr1
 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 4)
 $vgpr0 = COPY %1
@@ -3747,18 +3799,24 @@
 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p4) :: (load (<4 x s16>), addrspace 4)
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[BITCAST2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 %0:_(p4) = COPY $vgpr0_vgpr1
 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 8, addrspace 4)
@@ -3848,22 +3906,28 @@
 ; GFX9-NEXT: {{ $}}
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1
 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load (s16), align 4, addrspace 4)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
 ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s16) from unknown-address + 2, addrspace 4)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
 ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load (s16) from unknown-address + 4, align 4, addrspace 4)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 %0:_(p4) = COPY $vgpr0_vgpr1
 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 4, addrspace 4)
@@ -3953,22 +4017,28 @@
 ; GFX9-NEXT: {{ $}}
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1
 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load (s16), addrspace 4)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
 ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s16) from unknown-address + 2, addrspace 4)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
 ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load (s16) from unknown-address + 4, addrspace 4)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 %0:_(p4) = COPY $vgpr0_vgpr1
 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 2, addrspace 4)
@@ -4092,6 +4162,7 @@
 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
 ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
 ; GFX9-NEXT:
[[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p4) :: (load (s8) from unknown-address + 2, addrspace 4) @@ -4099,6 +4170,7 @@ ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load (s8) from unknown-address + 3, addrspace 4) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p4) :: (load (s8) from unknown-address + 4, addrspace 4) @@ -4106,16 +4178,20 @@ ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p4) :: (load (s8) from unknown-address + 5, addrspace 4) ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[BITCAST]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 1, addrspace 4) @@ -4249,18 +4325,22 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load (s16), addrspace 4) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load (s16) from unknown-address + 2, addrspace 4) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-NEXT: 
[[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load (s16) from unknown-address + 4, addrspace 4) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load (s16) from unknown-address + 6, addrspace 4) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 4) @@ -4376,6 +4456,7 @@ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p4) :: (load (s8) from unknown-address + 2, addrspace 4) @@ -4383,6 +4464,7 @@ ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load (s8) from unknown-address + 3, addrspace 4) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p4) :: (load (s8) from unknown-address + 4, addrspace 4) @@ -4390,6 +4472,7 @@ ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p4) :: (load (s8) from unknown-address + 5, addrspace 4) ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p4) :: (load (s8) from unknown-address + 6, addrspace 4) @@ -4397,9 +4480,10 @@ ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p4) :: (load (s8) from unknown-address + 7, addrspace 4) ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[OR]](s32), [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[OR3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -3697,11 +3697,13 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 0) $vgpr0 = COPY %1 @@ -3776,6 +3778,7 @@ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) @@ -3783,8 +3786,9 @@ ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 0) $vgpr0 = COPY %1 @@ -3869,18 +3873,24 @@ ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>)) ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x 
s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[BITCAST2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 8, addrspace 0) @@ -3970,22 +3980,28 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 4) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], 
[[C2]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 4, addrspace 0) @@ -4075,22 +4091,28 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR 
[[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 2, addrspace 0) @@ -4214,6 +4236,7 @@ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) @@ -4221,6 +4244,7 @@ ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -4228,16 +4252,20 @@ ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[BITCAST]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY 
[[CONCAT_VECTORS]](<6 x s16>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 1, addrspace 0) @@ -4371,18 +4399,22 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 0) @@ -4498,6 +4530,7 @@ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) @@ -4505,6 +4538,7 @@ ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -4512,6 +4546,7 @@ ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX9-NEXT: 
[[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) @@ -4519,9 +4554,10 @@ ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[OR3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir @@ -4458,11 +4458,15 @@ ; GFX9-HSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; GFX9-HSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C2]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LSHR]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32) - ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[TRUNC]](<4 x s8>) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[TRUNC4]](<4 x s8>) ; GFX9-MESA-LABEL: name: test_load_global_v4s8_align4 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} @@ -4474,11 +4478,15 @@ ; GFX9-MESA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; GFX9-MESA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], 
[[C2]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LSHR]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32) - ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[TRUNC]](<4 x s8>) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[TRUNC4]](<4 x s8>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<4 x s8>) = G_LOAD %0 :: (load (<4 x s8>), align 4, addrspace 1) $vgpr0 = COPY %1 @@ -4572,11 +4580,15 @@ ; GFX9-HSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; GFX9-HSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C2]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LSHR]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32) - ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[TRUNC]](<4 x s8>) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[TRUNC4]](<4 x s8>) ; GFX9-MESA-LABEL: name: test_load_global_v4s8_align2 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} @@ -4593,11 +4605,15 @@ ; GFX9-MESA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; GFX9-MESA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C3]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[LSHR]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32) - ; GFX9-MESA-NEXT: 
[[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[TRUNC]](<4 x s8>) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[TRUNC4]](<4 x s8>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<4 x s8>) = G_LOAD %0 :: (load (<4 x s8>), align 2, addrspace 1) $vgpr0 = COPY %1 @@ -4718,11 +4734,15 @@ ; GFX9-HSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; GFX9-HSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C2]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LSHR]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32) - ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[TRUNC]](<4 x s8>) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[TRUNC4]](<4 x s8>) ; GFX9-MESA-LABEL: name: test_load_global_v4s8_align1 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} @@ -4748,11 +4768,15 @@ ; GFX9-MESA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C3]](s32) ; GFX9-MESA-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; GFX9-MESA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C4]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[LSHR]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32) - ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[TRUNC]](<4 
x s8>) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[TRUNC4]](<4 x s8>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<4 x s8>) = G_LOAD %0 :: (load (<4 x s8>), align 1, addrspace 1) $vgpr0 = COPY %1 @@ -6221,11 +6245,13 @@ ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 1) $vgpr0 = COPY %1 @@ -6338,6 +6364,7 @@ ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-MESA-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p1) :: (load (s8) from unknown-address + 2, addrspace 1) @@ -6345,8 +6372,9 @@ ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s8) from unknown-address + 3, addrspace 1) ; GFX9-MESA-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 1) $vgpr0 = COPY %1 @@ -6497,18 +6525,24 @@ ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load (<4 x s16>), addrspace 1) ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x 
s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-HSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-HSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-HSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-HSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-HSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[BITCAST2]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32) - ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-HSA-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v3s16_align8 ; GFX9-MESA: liveins: $vgpr0_vgpr1 @@ -6517,18 +6551,24 @@ ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load (<4 x s16>), addrspace 1) ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-MESA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-MESA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-MESA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-MESA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-MESA-NEXT: 
[[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-MESA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[BITCAST2]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32) - ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-MESA-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 8, addrspace 1) @@ -6688,44 +6728,56 @@ ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-HSA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-HSA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-HSA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-HSA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-HSA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-HSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-HSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC 
[[BITCAST1]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-HSA-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v3s16_align4 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-MESA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-MESA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-MESA-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 4, addrspace 1) @@ -6885,44 +6937,56 @@ ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: 
[[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-HSA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-HSA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-HSA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-HSA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-HSA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-HSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-HSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-HSA-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v3s16_align2 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, addrspace 
1) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-MESA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-MESA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-MESA-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 2, addrspace 1) @@ -7124,22 +7188,28 @@ ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-HSA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-HSA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-HSA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-HSA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-HSA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-HSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-HSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-HSA-NEXT: 
[[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-HSA-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v3s16_align1 ; GFX9-MESA: liveins: $vgpr0_vgpr1 @@ -7152,6 +7222,7 @@ ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-MESA-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p1) :: (load (s8) from unknown-address + 2, addrspace 1) @@ -7159,6 +7230,7 @@ ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s8) from unknown-address + 3, addrspace 1) ; GFX9-MESA-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -7166,16 +7238,20 @@ ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load (s8) from unknown-address + 5, addrspace 1) ; GFX9-MESA-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-MESA-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-MESA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-MESA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-MESA-NEXT: 
[[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[BITCAST]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-MESA-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 1, addrspace 1) @@ -7385,18 +7461,22 @@ ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 1) @@ -7570,6 +7650,7 @@ ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-MESA-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; 
GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p1) :: (load (s8) from unknown-address + 2, addrspace 1) @@ -7577,6 +7658,7 @@ ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s8) from unknown-address + 3, addrspace 1) ; GFX9-MESA-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -7584,6 +7666,7 @@ ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load (s8) from unknown-address + 5, addrspace 1) ; GFX9-MESA-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX9-MESA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p1) :: (load (s8) from unknown-address + 6, addrspace 1) @@ -7591,9 +7674,10 @@ ; GFX9-MESA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p1) :: (load (s8) from unknown-address + 7, addrspace 1) ; GFX9-MESA-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[OR3]](s32) - ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 1) @@ -7754,20 +7838,26 @@ ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s16>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BITCAST]](<8 x s16>) ; GFX9-HSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-HSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-HSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; 
GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-HSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-HSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>) ; GFX9-HSA-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[LSHR]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[BITCAST4]](s32) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v5s16_align16 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} @@ -7776,20 +7866,26 @@ ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s16>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BITCAST]](<8 x s16>) ; GFX9-MESA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-MESA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-MESA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-MESA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-MESA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>) ; GFX9-MESA-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-MESA-NEXT: 
[[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[LSHR]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[BITCAST4]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<5 x s16>) = G_LOAD %0 :: (load (<5 x s16>), align 16, addrspace 1) %2:_(<5 x s16>) = G_IMPLICIT_DEF @@ -7968,53 +8064,65 @@ ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 8, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-HSA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-HSA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-HSA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-HSA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-HSA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-HSA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-HSA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-HSA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-HSA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-HSA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, align 8, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[BITCAST]](s32) - ; GFX9-HSA-NEXT: $vgpr0 = COPY 
[[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v5s16_align8 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 8, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-MESA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, align 8, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[BITCAST]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-MESA-NEXT: 
$vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<5 x s16>) = G_LOAD %0 :: (load (<5 x s16>), align 8, addrspace 1) %2:_(<5 x s16>) = G_IMPLICIT_DEF @@ -8193,53 +8301,65 @@ ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-HSA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-HSA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-HSA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-HSA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-HSA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-HSA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-HSA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-HSA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-HSA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-HSA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, align 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[BITCAST]](s32) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v5s16_align4 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 4, addrspace 1) + ; 
GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-MESA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, align 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[BITCAST]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<5 x s16>) = G_LOAD %0 :: (load (<5 x s16>), align 4, addrspace 1) %2:_(<5 x s16>) = G_IMPLICIT_DEF @@ -8422,53 +8542,65 @@ ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-HSA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-HSA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD1]](s32) ; GFX9-HSA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-HSA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-HSA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-HSA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-HSA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-HSA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-HSA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-HSA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[BITCAST]](s32) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v5s16_align2 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; 
GFX9-MESA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-MESA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[BITCAST]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<5 x s16>) = G_LOAD %0 :: (load (<5 x s16>), align 2, addrspace 1) %2:_(<5 x s16>) = G_IMPLICIT_DEF @@ -8717,27 +8849,33 @@ ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-HSA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-HSA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-HSA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-HSA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-HSA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-HSA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-HSA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-HSA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-HSA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-HSA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = 
G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[BITCAST]](s32) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v5s16_align1 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} @@ -8749,6 +8887,7 @@ ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-MESA-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p1) :: (load (s8) from unknown-address + 2, addrspace 1) @@ -8756,6 +8895,7 @@ ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s8) from unknown-address + 3, addrspace 1) ; GFX9-MESA-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -8763,6 +8903,7 @@ ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load (s8) from unknown-address + 5, addrspace 1) ; GFX9-MESA-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX9-MESA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p1) :: (load (s8) from unknown-address + 6, addrspace 1) @@ -8770,6 +8911,7 @@ ; GFX9-MESA-NEXT: 
[[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p1) :: (load (s8) from unknown-address + 7, addrspace 1) ; GFX9-MESA-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) ; GFX9-MESA-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p1) :: (load (s8) from unknown-address + 8, addrspace 1) @@ -8777,15 +8919,17 @@ ; GFX9-MESA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p1) :: (load (s8) from unknown-address + 9, addrspace 1) ; GFX9-MESA-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[OR4]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[OR3]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR4]](s32), [[BITCAST]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<5 x s16>) = G_LOAD %0 :: (load (<5 x s16>), align 1, addrspace 1) %2:_(<5 x s16>) = G_IMPLICIT_DEF @@ -9532,24 +9676,32 @@ ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s16>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BITCAST]](<8 x s16>) ; GFX9-HSA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-HSA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-HSA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-HSA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-HSA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = 
G_TRUNC [[BITCAST3]](s32) ; GFX9-HSA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-HSA-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<8 x s16>) ; GFX9-HSA-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[LSHR]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[LSHR2]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[BITCAST5]](s32) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v7s16_align16 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} @@ -9558,24 +9710,32 @@ ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s16>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BITCAST]](<8 x s16>) ; GFX9-MESA-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-MESA-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-MESA-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-MESA-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-MESA-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9-MESA-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-MESA-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST 
[[UV3]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>), [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<8 x s16>) ; GFX9-MESA-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[LSHR]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[LSHR2]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[BITCAST5]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR3]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<7 x s16>) = G_LOAD %0 :: (load (<7 x s16>), align 16, addrspace 1) %2:_(<7 x s16>) = G_IMPLICIT_DEF @@ -9807,69 +9967,85 @@ ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 8, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-HSA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-HSA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-HSA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-HSA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-HSA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-HSA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-HSA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-HSA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-HSA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-HSA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) 
from unknown-address + 8, align 8, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-HSA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-HSA-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-HSA-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load (s16) from unknown-address + 10, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-HSA-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 ; GFX9-HSA-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) ; GFX9-HSA-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p1) :: (load (s16) from unknown-address + 12, align 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<8 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[LOAD5]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD6]](s32), [[BITCAST]](s32) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v7s16_align8 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 8, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 4, addrspace 1) + ; GFX9-MESA-NEXT: 
[[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-MESA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, align 8, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-MESA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-MESA-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-MESA-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load (s16) from unknown-address + 10, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-MESA-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 ; GFX9-MESA-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) ; GFX9-MESA-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p1) :: (load (s16) from unknown-address + 12, align 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<8 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[LOAD5]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD6]](s32), [[BITCAST]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR3]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<7 x s16>) = G_LOAD %0 :: (load (<7 x s16>), align 8, addrspace 1) %2:_(<7 x s16>) = G_IMPLICIT_DEF @@ -10101,69 +10277,85 @@ ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 4, 
addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-HSA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-HSA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-HSA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-HSA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-HSA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-HSA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-HSA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-HSA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-HSA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-HSA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, align 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-HSA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-HSA-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-HSA-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load (s16) from unknown-address + 10, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-HSA-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 ; GFX9-HSA-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) ; GFX9-HSA-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p1) :: (load (s16) from unknown-address + 12, align 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<8 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[LOAD5]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD6]](s32), [[BITCAST]](s32) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-HSA-NEXT: 
[[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v7s16_align4 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-MESA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, align 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-MESA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-MESA-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-MESA-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load (s16) from unknown-address + 10, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-MESA-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 ; GFX9-MESA-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) ; GFX9-MESA-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p1) :: (load (s16) from unknown-address + 12, align 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<8 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[LOAD5]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD6]](s32), [[BITCAST]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR3]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<7 x s16>) = G_LOAD %0 :: (load (<7 x s16>), align 4, addrspace 1) %2:_(<7 x s16>) = G_IMPLICIT_DEF @@ -10395,69 +10587,85 @@ ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-HSA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-HSA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-HSA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-HSA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-HSA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-HSA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-HSA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-HSA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-HSA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-HSA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-HSA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-HSA-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-HSA-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load (s16) from unknown-address + 10, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-HSA-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 ; GFX9-HSA-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) ; GFX9-HSA-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p1) :: (load (s16) from unknown-address + 12, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), 
[[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<8 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[LOAD5]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD6]](s32), [[BITCAST]](s32) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v7s16_align2 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-MESA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-MESA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-MESA-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-MESA-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load (s16) from unknown-address + 10, addrspace 1) + ; GFX9-MESA-NEXT: 
[[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-MESA-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 ; GFX9-MESA-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) ; GFX9-MESA-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p1) :: (load (s16) from unknown-address + 12, addrspace 1) + ; GFX9-MESA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<8 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[LOAD5]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD6]](s32), [[BITCAST]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR3]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<7 x s16>) = G_LOAD %0 :: (load (<7 x s16>), align 2, addrspace 1) %2:_(<7 x s16>) = G_IMPLICIT_DEF @@ -10779,35 +10987,43 @@ ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load (s16), align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-HSA-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-HSA-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C]](s64) ; GFX9-HSA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s16) from unknown-address + 2, align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-HSA-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-HSA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) ; GFX9-HSA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s16) from unknown-address + 4, align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-HSA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-HSA-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-HSA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s16) from unknown-address + 6, align 1, addrspace 
1) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-HSA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-HSA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-HSA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s16) from unknown-address + 8, align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-HSA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-HSA-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-HSA-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load (s16) from unknown-address + 10, align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-HSA-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 ; GFX9-HSA-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) ; GFX9-HSA-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p1) :: (load (s16) from unknown-address + 12, align 1, addrspace 1) + ; GFX9-HSA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-HSA-NEXT: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF ; GFX9-HSA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<8 x s16>) ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD4]](s32), [[LOAD5]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD6]](s32), [[BITCAST]](s32) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-HSA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-HSA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-MESA-LABEL: name: test_load_global_v7s16_align1 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} @@ -10819,6 +11035,7 @@ ; GFX9-MESA-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-MESA-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-MESA-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p1) :: (load (s8) from unknown-address + 2, 
addrspace 1) @@ -10826,6 +11043,7 @@ ; GFX9-MESA-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s8) from unknown-address + 3, addrspace 1) ; GFX9-MESA-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -10833,6 +11051,7 @@ ; GFX9-MESA-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p1) :: (load (s8) from unknown-address + 5, addrspace 1) ; GFX9-MESA-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX9-MESA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p1) :: (load (s8) from unknown-address + 6, addrspace 1) @@ -10840,6 +11059,7 @@ ; GFX9-MESA-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p1) :: (load (s8) from unknown-address + 7, addrspace 1) ; GFX9-MESA-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) ; GFX9-MESA-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p1) :: (load (s8) from unknown-address + 8, addrspace 1) @@ -10847,6 +11067,7 @@ ; GFX9-MESA-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p1) :: (load (s8) from unknown-address + 9, addrspace 1) ; GFX9-MESA-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[OR4]](s32) ; GFX9-MESA-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-MESA-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C6]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p1) :: (load (s8) from unknown-address + 10, addrspace 1) @@ -10854,6 +11075,7 @@ ; GFX9-MESA-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p1) :: (load (s8) from unknown-address + 11, addrspace 1) ; GFX9-MESA-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[OR5]](s32) ; GFX9-MESA-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 ; GFX9-MESA-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C7]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p1) :: (load (s8) from unknown-address + 12, addrspace 1) @@ -10861,17 +11083,19 @@ ; GFX9-MESA-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p1) :: (load (s8) from unknown-address + 13, addrspace 1) ; GFX9-MESA-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) ; GFX9-MESA-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; GFX9-MESA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[OR6]](s32) + ; GFX9-MESA-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-MESA-NEXT: [[DEF:%[0-9]+]]:_(<8 x s16>) = G_IMPLICIT_DEF ; GFX9-MESA-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<8 x s16>) ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[OR3]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR4]](s32), [[OR5]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR6]](s32), [[BITCAST]](s32) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-MESA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-MESA-NEXT: $vgpr3 = COPY [[BUILD_VECTOR3]](<2 x s16>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<7 x s16>) = G_LOAD %0 :: (load (<7 x s16>), align 1, addrspace 1) %2:_(<7 x s16>) = G_IMPLICIT_DEF @@ -16780,25 +17004,57 @@ ; GFX9-HSA-NEXT: [[LSHR29:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C29]](s32) ; GFX9-HSA-NEXT: [[C30:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX9-HSA-NEXT: [[LSHR30:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C30]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LSHR]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR3]](s32), [[LSHR4]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR5]](s32), [[LSHR6]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR7]](s32), [[LSHR8]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR9]](s32), [[LSHR10]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR11]](s32), [[LSHR12]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR13]](s32), [[LSHR14]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR15]](s32), [[LSHR16]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR17]](s32), [[LSHR18]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR19]](s32), [[LSHR20]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[LSHR21]](s32), [[LSHR22]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR23]](s32), [[LSHR24]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR25]](s32), [[LSHR26]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR27]](s32), [[LSHR28]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC15:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR29]](s32), [[LSHR30]](s32) - ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>), [[BUILD_VECTOR_TRUNC8]](<2 x s16>), [[BUILD_VECTOR_TRUNC9]](<2 x s16>), [[BUILD_VECTOR_TRUNC10]](<2 x s16>), [[BUILD_VECTOR_TRUNC11]](<2 x s16>), [[BUILD_VECTOR_TRUNC12]](<2 x s16>), [[BUILD_VECTOR_TRUNC13]](<2 x s16>), [[BUILD_VECTOR_TRUNC14]](<2 x s16>), [[BUILD_VECTOR_TRUNC15]](<2 x s16>) - ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(<32 x s1>) = G_TRUNC [[CONCAT_VECTORS]](<32 x s16>) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[TRUNC]](<32 x s1>) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; GFX9-HSA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; GFX9-HSA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) + ; GFX9-HSA-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; GFX9-HSA-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32) + ; GFX9-HSA-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32) + ; GFX9-HSA-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR10]](s32) + ; GFX9-HSA-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32) + ; GFX9-HSA-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32) + ; GFX9-HSA-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR13]](s32) + ; GFX9-HSA-NEXT: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR14]](s32) + ; GFX9-HSA-NEXT: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR15]](s32) + ; GFX9-HSA-NEXT: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR16]](s32) + ; GFX9-HSA-NEXT: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR17]](s32) + ; GFX9-HSA-NEXT: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR18]](s32) + ; GFX9-HSA-NEXT: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32) + ; GFX9-HSA-NEXT: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR20]](s32) + ; GFX9-HSA-NEXT: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR21]](s32) + ; GFX9-HSA-NEXT: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR22]](s32) + ; GFX9-HSA-NEXT: [[TRUNC24:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR23]](s32) + ; GFX9-HSA-NEXT: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR24]](s32) + ; GFX9-HSA-NEXT: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR25]](s32) + ; GFX9-HSA-NEXT: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR26]](s32) + ; GFX9-HSA-NEXT: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR27]](s32) + ; GFX9-HSA-NEXT: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR28]](s32) + ; GFX9-HSA-NEXT: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR29]](s32) + ; GFX9-HSA-NEXT: 
[[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR30]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[TRUNC9]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC12]](s16), [[TRUNC13]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC14]](s16), [[TRUNC15]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC16]](s16), [[TRUNC17]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC18]](s16), [[TRUNC19]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC20]](s16), [[TRUNC21]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC22]](s16), [[TRUNC23]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC24]](s16), [[TRUNC25]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC26]](s16), [[TRUNC27]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC28]](s16), [[TRUNC29]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR15:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC30]](s16), [[TRUNC31]](s16) + ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>), [[BUILD_VECTOR8]](<2 x s16>), [[BUILD_VECTOR9]](<2 x s16>), [[BUILD_VECTOR10]](<2 x s16>), [[BUILD_VECTOR11]](<2 x s16>), [[BUILD_VECTOR12]](<2 x s16>), [[BUILD_VECTOR13]](<2 x s16>), [[BUILD_VECTOR14]](<2 x s16>), [[BUILD_VECTOR15]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC32:%[0-9]+]]:_(<32 x s1>) = G_TRUNC [[CONCAT_VECTORS]](<32 x s16>) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[TRUNC32]](<32 x s1>) ; GFX9-MESA-LABEL: name: test_load_global_v32s1_align4 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} @@ -16866,25 +17122,57 @@ ; GFX9-MESA-NEXT: [[LSHR29:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C29]](s32) ; GFX9-MESA-NEXT: [[C30:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; GFX9-MESA-NEXT: [[LSHR30:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C30]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LSHR]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR3]](s32), [[LSHR4]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR5]](s32), [[LSHR6]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR7]](s32), [[LSHR8]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[LSHR9]](s32), [[LSHR10]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR11]](s32), [[LSHR12]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR13]](s32), [[LSHR14]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR15]](s32), [[LSHR16]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR17]](s32), [[LSHR18]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR19]](s32), [[LSHR20]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR21]](s32), [[LSHR22]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR23]](s32), [[LSHR24]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR25]](s32), [[LSHR26]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR27]](s32), [[LSHR28]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC15:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR29]](s32), [[LSHR30]](s32) - ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>), [[BUILD_VECTOR_TRUNC8]](<2 x s16>), [[BUILD_VECTOR_TRUNC9]](<2 x s16>), [[BUILD_VECTOR_TRUNC10]](<2 x s16>), [[BUILD_VECTOR_TRUNC11]](<2 x s16>), [[BUILD_VECTOR_TRUNC12]](<2 x s16>), [[BUILD_VECTOR_TRUNC13]](<2 x s16>), [[BUILD_VECTOR_TRUNC14]](<2 x s16>), [[BUILD_VECTOR_TRUNC15]](<2 x s16>) - ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(<32 x s1>) = G_TRUNC [[CONCAT_VECTORS]](<32 x s16>) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[TRUNC]](<32 x s1>) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9-MESA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; GFX9-MESA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; GFX9-MESA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) + ; GFX9-MESA-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; GFX9-MESA-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32) + ; GFX9-MESA-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32) + ; GFX9-MESA-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR10]](s32) + ; GFX9-MESA-NEXT: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32) + ; GFX9-MESA-NEXT: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32) + ; GFX9-MESA-NEXT: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR13]](s32) + ; GFX9-MESA-NEXT: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR14]](s32) + ; GFX9-MESA-NEXT: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR15]](s32) + ; GFX9-MESA-NEXT: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR16]](s32) + ; GFX9-MESA-NEXT: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR17]](s32) + ; GFX9-MESA-NEXT: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR18]](s32) + ; GFX9-MESA-NEXT: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC 
[[LSHR19]](s32) + ; GFX9-MESA-NEXT: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR20]](s32) + ; GFX9-MESA-NEXT: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR21]](s32) + ; GFX9-MESA-NEXT: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR22]](s32) + ; GFX9-MESA-NEXT: [[TRUNC24:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR23]](s32) + ; GFX9-MESA-NEXT: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR24]](s32) + ; GFX9-MESA-NEXT: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR25]](s32) + ; GFX9-MESA-NEXT: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR26]](s32) + ; GFX9-MESA-NEXT: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR27]](s32) + ; GFX9-MESA-NEXT: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR28]](s32) + ; GFX9-MESA-NEXT: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR29]](s32) + ; GFX9-MESA-NEXT: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR30]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[TRUNC9]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC12]](s16), [[TRUNC13]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC14]](s16), [[TRUNC15]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC16]](s16), [[TRUNC17]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC18]](s16), [[TRUNC19]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC20]](s16), [[TRUNC21]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR11:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC22]](s16), [[TRUNC23]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR12:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC24]](s16), [[TRUNC25]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR13:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC26]](s16), [[TRUNC27]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR14:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC28]](s16), [[TRUNC29]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR15:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC30]](s16), [[TRUNC31]](s16) + ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>), [[BUILD_VECTOR7]](<2 x s16>), [[BUILD_VECTOR8]](<2 x s16>), [[BUILD_VECTOR9]](<2 x s16>), [[BUILD_VECTOR10]](<2 x s16>), [[BUILD_VECTOR11]](<2 x s16>), [[BUILD_VECTOR12]](<2 x s16>), [[BUILD_VECTOR13]](<2 x s16>), [[BUILD_VECTOR14]](<2 x s16>), [[BUILD_VECTOR15]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC32:%[0-9]+]]:_(<32 x s1>) = G_TRUNC [[CONCAT_VECTORS]](<32 x s16>) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[TRUNC32]](<32 x s1>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<32 x s1>) = G_LOAD %0 :: (load (<32 x s1>), align 4, addrspace 1) $vgpr0 = COPY %1 @@ -17003,13 +17291,21 @@ ; GFX9-HSA-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C5]](s32) ; GFX9-HSA-NEXT: 
[[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 ; GFX9-HSA-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C6]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LSHR]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR3]](s32), [[LSHR4]](s32) - ; GFX9-HSA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR5]](s32), [[LSHR6]](s32) - ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) - ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s4>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>) - ; GFX9-HSA-NEXT: $vgpr0 = COPY [[TRUNC]](<8 x s4>) + ; GFX9-HSA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9-HSA-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-HSA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-HSA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-HSA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; GFX9-HSA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; GFX9-HSA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; GFX9-HSA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-HSA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-HSA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) + ; GFX9-HSA-NEXT: [[TRUNC8:%[0-9]+]]:_(<8 x s4>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>) + ; GFX9-HSA-NEXT: $vgpr0 = COPY [[TRUNC8]](<8 x s4>) ; GFX9-MESA-LABEL: name: test_load_global_v8s4_align4 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} @@ -17029,13 +17325,21 @@ ; GFX9-MESA-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C5]](s32) ; GFX9-MESA-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 ; GFX9-MESA-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C6]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LSHR]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[LSHR2]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR3]](s32), [[LSHR4]](s32) - ; GFX9-MESA-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR5]](s32), [[LSHR6]](s32) - ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) - ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s4>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>) - ; GFX9-MESA-NEXT: $vgpr0 = COPY [[TRUNC]](<8 x s4>) + ; GFX9-MESA-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9-MESA-NEXT: 
[[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-MESA-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-MESA-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-MESA-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; GFX9-MESA-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; GFX9-MESA-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; GFX9-MESA-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-MESA-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) + ; GFX9-MESA-NEXT: [[TRUNC8:%[0-9]+]]:_(<8 x s4>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>) + ; GFX9-MESA-NEXT: $vgpr0 = COPY [[TRUNC8]](<8 x s4>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<8 x s4>) = G_LOAD %0 :: (load (<8 x s4>), align 4, addrspace 1) $vgpr0 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir @@ -9416,11 +9416,13 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX9-UNALIGNED-LABEL: name: test_load_local_v2s16_align2 ; GFX9-UNALIGNED: liveins: $vgpr0 ; GFX9-UNALIGNED-NEXT: {{ $}} @@ -9432,11 +9434,13 @@ ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; 
GFX10-UNALIGNED-LABEL: name: test_load_local_v2s16_align2 ; GFX10-UNALIGNED: liveins: $vgpr0 ; GFX10-UNALIGNED-NEXT: {{ $}} @@ -9448,11 +9452,13 @@ ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3) + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX11-UNALIGNED-LABEL: name: test_load_local_v2s16_align2 ; GFX11-UNALIGNED: liveins: $vgpr0 ; GFX11-UNALIGNED-NEXT: {{ $}} @@ -9585,6 +9591,7 @@ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p3) :: (load (s8) from unknown-address + 2, addrspace 3) @@ -9592,8 +9599,9 @@ ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s8) from unknown-address + 3, addrspace 3) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX9-UNALIGNED-LABEL: name: test_load_local_v2s16_align1 ; GFX9-UNALIGNED: liveins: $vgpr0 ; GFX9-UNALIGNED-NEXT: {{ $}} @@ -9611,6 +9619,7 @@ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p3) :: (load (s8) from unknown-address + 2, addrspace 3) @@ -9618,8 +9627,9 @@ ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s8) from unknown-address + 3, addrspace 3) ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x 
s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2s16_align1
 ; GFX10-UNALIGNED: liveins: $vgpr0
 ; GFX10-UNALIGNED-NEXT: {{ $}}
@@ -9637,6 +9647,7 @@
 ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
 ; GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
 ; GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
 ; GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p3) :: (load (s8) from unknown-address + 2, addrspace 3)
@@ -9644,8 +9655,9 @@
 ; GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s8) from unknown-address + 3, addrspace 3)
 ; GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
 ; GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32)
- ; GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+ ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 ; GFX11-UNALIGNED-LABEL: name: test_load_local_v2s16_align1
 ; GFX11-UNALIGNED: liveins: $vgpr0
 ; GFX11-UNALIGNED-NEXT: {{ $}}
@@ -9802,18 +9814,24 @@
 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load (<4 x s16>), addrspace 3)
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[BITCAST2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX9-UNALIGNED-LABEL: name: test_load_local_v3s16_align8
 ; GFX9-UNALIGNED: liveins: $vgpr0
@@ -9822,18 +9840,24 @@
 ; GFX9-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load (<4 x s16>), addrspace 3)
 ; GFX9-UNALIGNED-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
 ; GFX9-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-UNALIGNED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-UNALIGNED-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX9-UNALIGNED-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-UNALIGNED-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX9-UNALIGNED-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX9-UNALIGNED-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
 ; GFX9-UNALIGNED-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[BITCAST2]](s32)
- ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32)
- ; GFX9-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+ ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-UNALIGNED-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX10-LABEL: name: test_load_local_v3s16_align8
 ; GFX10: liveins: $vgpr0
@@ -9842,18 +9866,24 @@
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load (<4 x s16>), addrspace 3)
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
 ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[BITCAST2]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX10-UNALIGNED-LABEL: name: test_load_local_v3s16_align8
 ; GFX10-UNALIGNED: liveins: $vgpr0
@@ -9862,18 +9892,24 @@
 ; GFX10-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load (<4 x s16>), addrspace 3)
 ; GFX10-UNALIGNED-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
 ; GFX10-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX10-UNALIGNED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10-UNALIGNED-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX10-UNALIGNED-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX10-UNALIGNED-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX10-UNALIGNED-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX10-UNALIGNED-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
 ; GFX10-UNALIGNED-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[BITCAST2]](s32)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32)
- ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-UNALIGNED-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX11-LABEL: name: test_load_local_v3s16_align8
 ; GFX11: liveins: $vgpr0
@@ -9882,18 +9918,24 @@
 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load (<4 x s16>), addrspace 3)
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX11-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
 ; GFX11-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[BITCAST2]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32)
- ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+ ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX11-UNALIGNED-LABEL: name: test_load_local_v3s16_align8
 ; GFX11-UNALIGNED: liveins: $vgpr0
@@ -9902,18 +9944,24 @@
 ; GFX11-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load (<4 x s16>), addrspace 3)
 ; GFX11-UNALIGNED-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
 ; GFX11-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX11-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX11-UNALIGNED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX11-UNALIGNED-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX11-UNALIGNED-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX11-UNALIGNED-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX11-UNALIGNED-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX11-UNALIGNED-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
 ; GFX11-UNALIGNED-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[BITCAST2]](s32)
- ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32)
- ; GFX11-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+ ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX11-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX11-UNALIGNED-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 %0:_(p3) = COPY $vgpr0
 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 8, addrspace 3)
@@ -10073,132 +10121,168 @@
 ; GFX9-NEXT: {{ $}}
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, addrspace 3)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX9-UNALIGNED-LABEL: name: test_load_local_v3s16_align2
 ; GFX9-UNALIGNED: liveins: $vgpr0
 ; GFX9-UNALIGNED-NEXT: {{ $}}
 ; GFX9-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX9-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX9-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX9-UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX9-UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX9-UNALIGNED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX9-UNALIGNED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX9-UNALIGNED-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, addrspace 3)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX9-UNALIGNED-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-UNALIGNED-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX9-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-UNALIGNED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-UNALIGNED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-UNALIGNED-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32)
- ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX9-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-UNALIGNED-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX10-LABEL: name: test_load_local_v3s16_align2
 ; GFX10: liveins: $vgpr0
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3)
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, addrspace 3)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+ ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX10-UNALIGNED-LABEL: name: test_load_local_v3s16_align2
 ; GFX10-UNALIGNED: liveins: $vgpr0
 ; GFX10-UNALIGNED-NEXT: {{ $}}
 ; GFX10-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX10-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX10-UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX10-UNALIGNED-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX10-UNALIGNED-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX10-UNALIGNED-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX10-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX10-UNALIGNED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10-UNALIGNED-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-UNALIGNED-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX11-LABEL: name: test_load_local_v3s16_align2
 ; GFX11: liveins: $vgpr0
 ; GFX11-NEXT: {{ $}}
 ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3)
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, addrspace 3)
+ ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+ ; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX11-UNALIGNED-LABEL: name: test_load_local_v3s16_align2
 ; GFX11-UNALIGNED: liveins: $vgpr0
 ; GFX11-UNALIGNED-NEXT: {{ $}}
 ; GFX11-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX11-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX11-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX11-UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX11-UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX11-UNALIGNED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX11-UNALIGNED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX11-UNALIGNED-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, addrspace 3)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX11-UNALIGNED-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX11-UNALIGNED-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX11-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX11-UNALIGNED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX11-UNALIGNED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX11-UNALIGNED-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32)
- ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX11-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX11-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX11-UNALIGNED-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 %0:_(p3) = COPY $vgpr0
 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 2, addrspace 3)
@@ -10420,6 +10504,7 @@
 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
 ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
 ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p3) :: (load (s8) from unknown-address + 2, addrspace 3)
@@ -10427,6 +10512,7 @@
 ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s8) from unknown-address + 3, addrspace 3)
 ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
 ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
 ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32)
 ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p3) :: (load (s8) from unknown-address + 4, addrspace 3)
@@ -10434,38 +10520,48 @@
 ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load (s8) from unknown-address + 5, addrspace 3)
 ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
 ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[BITCAST]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX9-UNALIGNED-LABEL: name: test_load_local_v3s16_align1
 ; GFX9-UNALIGNED: liveins: $vgpr0
 ; GFX9-UNALIGNED-NEXT: {{ $}}
 ; GFX9-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX9-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), align 1, addrspace 3)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX9-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX9-UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX9-UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, align 1, addrspace 3)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX9-UNALIGNED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX9-UNALIGNED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX9-UNALIGNED-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, align 1, addrspace 3)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX9-UNALIGNED-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-UNALIGNED-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX9-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-UNALIGNED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-UNALIGNED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-UNALIGNED-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32)
- ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX9-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-UNALIGNED-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-UNALIGNED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX9-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-UNALIGNED-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX10-LABEL: name: test_load_local_v3s16_align1
 ; GFX10: liveins: $vgpr0
@@ -10478,6 +10574,7 @@
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
 ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p3) :: (load (s8) from unknown-address + 2, addrspace 3)
@@ -10485,6 +10582,7 @@
 ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s8) from unknown-address + 3, addrspace 3)
 ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
 ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32)
 ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p3) :: (load (s8) from unknown-address + 4, addrspace 3)
@@ -10492,38 +10590,48 @@
 ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load (s8) from unknown-address + 5, addrspace 3)
 ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
 ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
+ ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[BITCAST]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX10-UNALIGNED-LABEL: name: test_load_local_v3s16_align1
 ; GFX10-UNALIGNED: liveins: $vgpr0
 ; GFX10-UNALIGNED-NEXT: {{ $}}
 ; GFX10-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX10-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), align 1, addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX10-UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, align 1, addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX10-UNALIGNED-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, align 1, addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX10-UNALIGNED-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX10-UNALIGNED-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX10-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX10-UNALIGNED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10-UNALIGNED-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX10-UNALIGNED-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX11-LABEL: name: test_load_local_v3s16_align1
 ; GFX11: liveins: $vgpr0
@@ -10536,6 +10644,7 @@
 ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
 ; GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
 ; GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
 ; GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p3) :: (load (s8) from unknown-address + 2, addrspace 3)
@@ -10543,6 +10652,7 @@
 ; GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s8) from unknown-address + 3, addrspace 3)
 ; GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
 ; GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
 ; GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32)
 ; GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p3) :: (load (s8) from unknown-address + 4, addrspace 3)
@@ -10550,38 +10660,48 @@
 ; GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load (s8) from unknown-address + 5, addrspace 3)
 ; GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
 ; GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+ ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
 ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
+ ; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[BITCAST]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX11-UNALIGNED-LABEL: name: test_load_local_v3s16_align1
 ; GFX11-UNALIGNED: liveins: $vgpr0
 ; GFX11-UNALIGNED-NEXT: {{ $}}
 ; GFX11-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX11-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), align 1, addrspace 3)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX11-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX11-UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX11-UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, align 1, addrspace 3)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX11-UNALIGNED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX11-UNALIGNED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX11-UNALIGNED-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, align 1, addrspace 3)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX11-UNALIGNED-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX11-UNALIGNED-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX11-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX11-UNALIGNED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX11-UNALIGNED-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX11-UNALIGNED-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32)
- ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32)
- ; GFX11-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX11-UNALIGNED-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX11-UNALIGNED-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+ ; GFX11-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX11-UNALIGNED-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 %0:_(p3) = COPY $vgpr0
 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 1, addrspace 3)
@@ -10729,36 +10849,44 @@
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), align 4, addrspace 3)
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, align 4, addrspace 3)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
 ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s16) from unknown-address + 6, addrspace 3)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
 ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
 ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s16_align4
 ; GFX10-UNALIGNED: liveins: $vgpr0
 ; GFX10-UNALIGNED-NEXT: {{ $}}
 ; GFX10-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX10-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), align 4, addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX10-UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX10-UNALIGNED-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, align 4, addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX10-UNALIGNED-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s16) from unknown-address + 6, addrspace 3)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32)
- ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
 ; GFX10-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
 ; GFX11-LABEL: name: test_load_local_v4s16_align4
 ; GFX11: liveins: $vgpr0
@@ -10899,18 +11027,22 @@
 ; GFX9-NEXT: {{ $}}
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, addrspace 3)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
 ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s16) from unknown-address + 6, addrspace 3)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
 ; GFX9-UNALIGNED-LABEL: name: test_load_local_v4s16_align2
 ; GFX9-UNALIGNED: liveins: $vgpr0
@@ -10923,54 +11055,66 @@
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3)
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, addrspace 3)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
 ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s16) from unknown-address + 6, addrspace 3)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
 ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
 ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s16_align2
 ; GFX10-UNALIGNED: liveins: $vgpr0
 ; GFX10-UNALIGNED-NEXT: {{ $}}
 ; GFX10-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX10-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX10-UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX10-UNALIGNED-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, addrspace 3)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX10-UNALIGNED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX10-UNALIGNED-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s16) from unknown-address + 6, addrspace 3)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32)
- ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
 ; GFX10-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
 ; GFX11-LABEL: name: test_load_local_v4s16_align2
 ; GFX11: liveins: $vgpr0
 ; GFX11-NEXT: {{ $}}
 ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), addrspace 3)
+ ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
 ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
 ; GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+ ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
 ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
 ; GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, addrspace 3)
+ ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
 ; GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
 ; GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s16) from unknown-address + 6, addrspace 3)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32)
- ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32)
- ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+ ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
 ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
 ; GFX11-UNALIGNED-LABEL: name: test_load_local_v4s16_align2
 ; GFX11-UNALIGNED: liveins: $vgpr0
@@ -11184,6 +11328,7 @@
 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
 ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
 ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p3) :: (load (s8) from unknown-address + 2, addrspace 3)
@@ -11191,6 +11336,7 @@
 ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s8) from unknown-address + 3, addrspace 3)
 ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
 ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
 ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32)
 ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p3) :: (load (s8) from unknown-address + 4, addrspace 3)
@@ -11198,6 +11344,7 @@
 ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load (s8) from unknown-address + 5, addrspace 3)
 ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
 ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
 ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
 ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32)
 ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p3) :: (load (s8) from unknown-address + 6, addrspace 3)
@@ -11205,9 +11352,10 @@
 ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load (s8) from unknown-address + 7, addrspace 3)
 ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
 ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[OR3]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
 ; GFX9-UNALIGNED-LABEL: name: test_load_local_v4s16_align1
 ; GFX9-UNALIGNED: liveins: $vgpr0
@@ -11226,6 +11374,7 @@
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
 ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
 ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p3) :: (load (s8) from unknown-address + 2, addrspace 3)
@@ -11233,6 +11382,7 @@
 ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s8) from unknown-address + 3, addrspace 3)
 ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
 ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
 ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32)
 ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p3) :: (load (s8) from unknown-address + 4, addrspace 3)
@@ -11240,6 +11390,7 @@
 ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load (s8) from unknown-address + 5, addrspace 3)
 ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
 ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
 ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32)
 ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p3) :: (load (s8) from unknown-address + 6, addrspace 3)
@@ -11247,27 +11398,32 @@
 ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load (s8) from unknown-address + 7, addrspace 3)
 ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
 ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[OR3]](s32)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR
[[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; GFX10-UNALIGNED-LABEL: name: test_load_local_v4s16_align1 ; GFX10-UNALIGNED: liveins: $vgpr0 ; GFX10-UNALIGNED-NEXT: {{ $}} ; GFX10-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX10-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), align 1, addrspace 3) + ; GFX10-UNALIGNED-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX10-UNALIGNED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX10-UNALIGNED-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, align 1, addrspace 3) + ; GFX10-UNALIGNED-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX10-UNALIGNED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32) ; GFX10-UNALIGNED-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, align 1, addrspace 3) + ; GFX10-UNALIGNED-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX10-UNALIGNED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; GFX10-UNALIGNED-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX10-UNALIGNED-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s16) from unknown-address + 6, align 1, addrspace 3) - ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-UNALIGNED-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-UNALIGNED-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-UNALIGNED-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX10-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; GFX11-LABEL: name: test_load_local_v4s16_align1 ; GFX11: liveins: $vgpr0 @@ -11280,6 +11436,7 @@ ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p3) :: (load (s8) from unknown-address + 2, addrspace 3) @@ -11287,6 +11444,7 @@ ; GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s8) from unknown-address + 3, addrspace 3) ; GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; GFX11-NEXT: 
[[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) ; GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32) ; GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p3) :: (load (s8) from unknown-address + 4, addrspace 3) @@ -11294,6 +11452,7 @@ ; GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load (s8) from unknown-address + 5, addrspace 3) ; GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32) ; GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p3) :: (load (s8) from unknown-address + 6, addrspace 3) @@ -11301,9 +11460,10 @@ ; GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load (s8) from unknown-address + 7, addrspace 3) ; GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) ; GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[OR3]](s32) - ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; GFX11-UNALIGNED-LABEL: name: test_load_local_v4s16_align1 ; GFX11-UNALIGNED: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -6349,21 +6349,25 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX10-LABEL: name: test_load_private_v2s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; GFX10-NEXT: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX11-LABEL: name: test_load_private_v2s16_align2 ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} @@ -6470,6 +6474,7 @@ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) @@ -6477,8 +6482,9 @@ ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX10-LABEL: name: test_load_private_v2s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -6490,6 +6496,7 @@ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) @@ -6497,8 +6504,9 @@ ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX11-LABEL: name: test_load_private_v2s16_align1 ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} @@ -6626,18 +6634,24 @@ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX9-NEXT: 
[[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 4, addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD1]](s32), [[BITCAST1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST2]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; GFX10-LABEL: name: test_load_private_v3s16_align8 ; GFX10: liveins: $vgpr0 @@ -6647,18 +6661,24 @@ ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 4, addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), 
[[LSHR]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD1]](s32), [[BITCAST1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST2]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; GFX11-LABEL: name: test_load_private_v3s16_align8 ; GFX11: liveins: $vgpr0 @@ -6667,18 +6687,24 @@ ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), addrspace 5) ; GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>) ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX11-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX11-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX11-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[BITCAST2]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST3]](s32) - ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s16>) = 
G_LOAD %0 :: (load (<3 x s16>), align 8, addrspace 5) @@ -6803,66 +6829,84 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; GFX10-LABEL: name: test_load_private_v3s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) 
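; NOTE: for the odd-element <3 x s16> loads, only the first three G_TRUNCs above
; narrow real data; the remaining lanes are padding pulled out of the
; G_IMPLICIT_DEF below via G_BITCAST/G_LSHR, and each of those halves now also
; gets its own s16 G_TRUNC before the G_BUILD_VECTORs are formed.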
; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; GFX11-LABEL: name: test_load_private_v3s16_align2 ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC 
[[LSHR]](s32), [[BITCAST1]](s32) - ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 2, addrspace 5) @@ -7035,6 +7079,7 @@ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) @@ -7042,6 +7087,7 @@ ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) @@ -7049,16 +7095,20 @@ ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[BITCAST]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; GFX10-LABEL: name: test_load_private_v3s16_align1 ; GFX10: liveins: $vgpr0 @@ -7071,6 +7121,7 @@ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) @@ -7078,6 +7129,7 @@ ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) @@ -7085,38 +7137,48 @@ ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[BITCAST]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) 
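; NOTE: under align 1 the bytes are still assembled in s32 (G_ZEXTLOAD, G_SHL by
; 8, G_OR) exactly as before; only the merged result is narrowed. With
; hypothetical registers, the added step is just:
;   %or:_(s32) = G_OR %shl(s32), %zext(s32)
;   %t:_(s16) = G_TRUNC %or(s32)
; so the diff adds one G_TRUNC per 16-bit element, never one per byte.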
+ ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) ; GFX11-LABEL: name: test_load_private_v3s16_align1 ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) + ; GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[BITCAST]](s32) - ; GFX11-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[BITCAST1]](s32) - ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 1, addrspace 5) @@ -7349,34 +7411,42 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) 
= G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; GFX10-LABEL: name: test_load_private_v4s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD]](s32), [[LOAD1]](s32) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LOAD2]](s32), [[LOAD3]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; GFX11-LABEL: name: test_load_private_v4s16_align2 ; GFX11: liveins: $vgpr0 @@ -7541,6 +7611,7 @@ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; 
GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) @@ -7548,7 +7619,8 @@ ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) @@ -7556,14 +7628,16 @@ ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[OR3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; GFX10-LABEL: name: test_load_private_v4s16_align1 ; GFX10: liveins: $vgpr0 @@ -7576,6 +7650,7 @@ ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) @@ -7583,7 +7658,8 @@ ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; 
GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) @@ -7591,14 +7667,16 @@ ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[OR2]](s32), [[OR3]](s32) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; GFX11-LABEL: name: test_load_private_v4s16_align1 ; GFX11: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-lshr.mir @@ -656,10 +656,8 @@ ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[TRUNC2]](s16) ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[TRUNC3]](s16) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[LSHR1]](s16), [[LSHR2]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s32>) = COPY $vgpr0_vgpr1 %2:_(<2 x s16>) = G_LSHR %0, %1 @@ -759,29 +757,35 @@ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; 
GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[TRUNC5]](s16) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[LSHR2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) ; GFX9-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR4]](s32) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[LSHR5]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[LSHR3]](s16), [[TRUNC8]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -897,31 +901,39 @@ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x 
s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
 ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16)
+ ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s16>)
+ ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[TRUNC5]](s16)
 ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[LSHR2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
 ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
 ; GFX9-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32)
 ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR4]](s32)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR3]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[BITCAST5]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR5]](s32), [[BITCAST6]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[LSHR3]](s16), [[TRUNC8]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC9]](s16), [[TRUNC10]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
 %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-mul.mir
@@ -469,22 +469,27 @@
 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY6]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]]
- ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]]
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR1]], [[BUILD_VECTOR3]]
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[MUL]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[MUL1]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
- ; GFX9-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s16), implicit [[TRUNC1]](s16), implicit [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[TRUNC6]](s16), implicit [[TRUNC7]](s16), implicit [[TRUNC8]](s16)
 ; GFX10-LABEL: name: test_mul_v3s16
 ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
 ; GFX10-NEXT: {{ $}}
@@ -494,22 +499,27 @@
 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[DEF]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY6]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32)
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF]](s32)
- ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]]
- ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]]
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+ ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+ ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32)
+ ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16)
+ ; GFX10-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR2]]
+ ; GFX10-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR1]], [[BUILD_VECTOR3]]
 ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[MUL]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX10-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[MUL1]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s16), implicit [[TRUNC1]](s16), implicit [[TRUNC2]](s16)
+ ; GFX10-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[TRUNC6]](s16), implicit [[TRUNC7]](s16), implicit [[TRUNC8]](s16)
 %0:_(s32) = COPY $vgpr0
 %1:_(s32) = COPY $vgpr1
 %2:_(s32) = COPY $vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir
@@ -249,23 +249,26 @@
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[LSHR]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[LSHR1]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
 ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY2]](s32)
- ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
- ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16)
+ ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR]], [[BUILD_VECTOR2]](<2 x s16>)
+ ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR1]], [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[SADDSAT:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[SHL]], [[SHL1]]
- ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SADDSAT]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SADDSAT]], [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[ASHR]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
 ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
- ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
- ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
+ ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]]
+ ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]]
 ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16)
 ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
 ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16)
@@ -587,31 +590,43 @@
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST2]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[SADDSAT:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]]
- ; GFX9-NEXT: [[SADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]]
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[SADDSAT:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[BUILD_VECTOR]], [[BUILD_VECTOR2]]
+ ; GFX9-NEXT: [[SADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_SADDSAT [[BUILD_VECTOR1]], [[BUILD_VECTOR3]]
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[SADDSAT]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
 ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
 ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[SADDSAT1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
 ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
 ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32)
 ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[LSHR3]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[BITCAST5]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR4]](s32), [[BITCAST6]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[TRUNC9]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
 %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir
@@ -2888,6 +2888,7 @@
 ; GFX9-NEXT: [[XOR2:%[0-9]+]]:_(s32) = G_XOR [[ASHR]], [[ASHR1]]
 ; GFX9-NEXT: [[XOR3:%[0-9]+]]:_(s32) = G_XOR [[SELECT2]], [[XOR2]]
 ; GFX9-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[XOR3]], [[XOR2]]
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SUB3]](s32)
 ; GFX9-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 16
 ; GFX9-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR1]], 16
 ; GFX9-NEXT: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SEXT_INREG2]], [[C1]](s32)
@@ -2918,8 +2919,9 @@
 ; GFX9-NEXT: [[XOR6:%[0-9]+]]:_(s32) = G_XOR [[ASHR2]], [[ASHR3]]
 ; GFX9-NEXT: [[XOR7:%[0-9]+]]:_(s32) = G_XOR [[SELECT5]], [[XOR6]]
 ; GFX9-NEXT: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[XOR7]], [[XOR6]]
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SUB3]](s32), [[SUB7]](s32)
- ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SUB7]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 ; GFX10-LABEL: name: test_sdiv_v2s16
 ; GFX10: liveins: $vgpr0, $vgpr1
 ; GFX10-NEXT: {{ $}}
@@ -2964,6 +2966,7 @@
 ; GFX10-NEXT: [[XOR2:%[0-9]+]]:_(s32) = G_XOR [[ASHR]], [[ASHR1]]
 ; GFX10-NEXT: [[XOR3:%[0-9]+]]:_(s32) = G_XOR [[SELECT2]], [[XOR2]]
 ; GFX10-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[XOR3]], [[XOR2]]
+ ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SUB3]](s32)
 ; GFX10-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 16
 ; GFX10-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR1]], 16
 ; GFX10-NEXT: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SEXT_INREG2]], [[C1]](s32)
@@ -2994,8 +2997,9 @@
 ; GFX10-NEXT: [[XOR6:%[0-9]+]]:_(s32) = G_XOR [[ASHR2]], [[ASHR3]]
 ; GFX10-NEXT: [[XOR7:%[0-9]+]]:_(s32) = G_XOR [[SELECT5]], [[XOR6]]
 ; GFX10-NEXT: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[XOR7]], [[XOR6]]
- ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SUB3]](s32), [[SUB7]](s32)
- ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SUB7]](s32)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 %0:_(<2 x s16>) = COPY $vgpr0
 %1:_(<2 x s16>) = COPY $vgpr1
 %2:_(<2 x s16>) = G_SDIV %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sext-inreg.mir
@@ -772,11 +772,10 @@
 ; GFX9: liveins: $vgpr0
 ; GFX9-NEXT: {{ $}}
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[C]](s32)
- ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[COPY]], [[BUILD_VECTOR_TRUNC]](<2 x s16>)
- ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL]], [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[COPY]], [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL]], [[BUILD_VECTOR]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0 = COPY [[ASHR]](<2 x s16>)
 ; GFX8-LABEL: name: test_sext_inreg_v2s16_1
 ; GFX8: liveins: $vgpr0
@@ -830,34 +829,40 @@
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
- ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[C1]](s32)
- ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
- ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
- ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL1]], [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16)
+ ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR]], [[BUILD_VECTOR2]](<2 x s16>)
+ ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL]], [[BUILD_VECTOR2]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16)
+ ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR1]], [[BUILD_VECTOR3]](<2 x s16>)
+ ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL1]], [[BUILD_VECTOR3]](<2 x s16>)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[ASHR]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[ASHR1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
 ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
 ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
 ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[BITCAST4]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[BITCAST5]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[TRUNC6]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC7]](s16), [[TRUNC8]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX8-LABEL: name: test_sext_inreg_v3s16_1
 ; GFX8: liveins: $vgpr0_vgpr1_vgpr2
@@ -1040,16 +1045,13 @@
 ; GFX9-NEXT: {{ $}}
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[C]](s32)
- ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV]], [[BUILD_VECTOR_TRUNC]](<2 x s16>)
- ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL]], [[BUILD_VECTOR_TRUNC]](<2 x s16>)
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV1]], [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL1]], [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV]], [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL]], [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV1]], [[BUILD_VECTOR1]](<2 x s16>)
+ ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL1]], [[BUILD_VECTOR1]](<2 x s16>)
 ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[ASHR]](<2 x s16>), [[ASHR1]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
 ; GFX8-LABEL: name: test_sext_inreg_v4s16_1
@@ -1127,21 +1129,16 @@
 ; GFX9-LABEL: name: test_sext_inreg_v6s16_1
 ; GFX9: [[DEF:%[0-9]+]]:_(<6 x s16>) = G_IMPLICIT_DEF
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<6 x s16>)
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[C]](s32)
- ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV]], [[BUILD_VECTOR_TRUNC]](<2 x s16>)
- ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL]], [[BUILD_VECTOR_TRUNC]](<2 x s16>)
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C]](s32)
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32)
- ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV1]], [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL1]], [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32)
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32)
- ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV2]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
- ; GFX9-NEXT: [[ASHR2:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL2]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV]], [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL]], [[BUILD_VECTOR]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV1]], [[BUILD_VECTOR1]](<2 x s16>)
+ ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL1]], [[BUILD_VECTOR1]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV2]], [[BUILD_VECTOR2]](<2 x s16>)
+ ; GFX9-NEXT: [[ASHR2:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SHL2]], [[BUILD_VECTOR2]](<2 x s16>)
 ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[ASHR]](<2 x s16>), [[ASHR1]](<2 x s16>), [[ASHR2]](<2 x s16>)
 ; GFX9-NEXT: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<6 x s16>)
 ; GFX8-LABEL: name: test_sext_inreg_v6s16_1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir
@@ -643,10 +643,8 @@
 ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC2]](s16)
 ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32)
 ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[TRUNC3]](s16)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL]](s16)
- ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL1]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32)
- ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SHL]](s16), [[SHL1]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 %0:_(<2 x s16>) = COPY $vgpr0
 %1:_(<2 x s32>) = COPY $vgpr0_vgpr1
 %2:_(<2 x s16>) = G_SHL %0, %1
@@ -754,31 +752,39 @@
 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
 ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<6 x s16>)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
 ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>)
- ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
- ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16)
+ ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s16>)
+ ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[TRUNC5]](s16)
 ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[SHL]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32)
 ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
 ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
 ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32)
 ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
 ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL1]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[BITCAST5]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR3]](s32), [[BITCAST6]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SHL1]](s16), [[TRUNC8]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC9]](s16), [[TRUNC10]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[BUILD_VECTOR4]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
 %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
 %1:_(<6 x s16>) = COPY $vgpr3_vgpr4_vgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir
@@ -240,16 +240,14 @@
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C2]]
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -274,15 +272,13 @@
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C2]]
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C1]]
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -313,18 +309,14 @@
 ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
 ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
 ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
 ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
+ ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C1]]
+ ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
 ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32)
 ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
 ; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir
@@ -50,14 +50,12 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C]]
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C2]](s32)
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
@@ -88,15 +86,13 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
 ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32)
- ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
+ ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
 ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_undef_0
@@ -126,16 +122,14 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C2]]
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -167,15 +161,13 @@
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C2]]
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C1]]
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -206,16 +198,13 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]]
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C]]
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -285,16 +274,13 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]]
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C]]
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -325,14 +311,12 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C]]
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY2]], [[C2]](s32)
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
@@ -363,15 +347,13 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
 ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32)
- ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
+ ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
 ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_undef_2
@@ -401,16 +383,14 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C2]]
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -442,15 +422,13 @@
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C2]]
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C1]]
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -594,16 +572,13 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]]
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C]]
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -612,13 +587,14 @@
 ; GFX9-NEXT: {{ $}}
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY2]](s16), [[COPY3]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 %0:_(<2 x s16>) = COPY $vgpr0
 %1:_(<2 x s16>) = COPY $vgpr1
 %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0, 2)
@@ -639,16 +615,13 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]]
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C]]
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -680,15 +653,13 @@
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C2]]
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C1]]
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -698,13 +669,15 @@
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY2]](s16), [[COPY3]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 %0:_(<2 x s16>) = COPY $vgpr0
 %1:_(<2 x s16>) = COPY $vgpr1
 %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(3, 0)
@@ -725,16 +698,14 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C2]]
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -743,14 +714,16 @@
 ; GFX9-NEXT: {{ $}}
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY2]](s16), [[COPY3]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 %0:_(<2 x s16>) = COPY $vgpr0
 %1:_(<2 x s16>) = COPY $vgpr1
 %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0, 3)
@@ -772,15 +745,13 @@
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C2]]
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C1]]
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -790,13 +761,15 @@
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY2]](s16), [[COPY3]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 %0:_(<2 x s16>) = COPY $vgpr0
 %1:_(<2 x s16>) = COPY $vgpr1
 %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(1, 2)
@@ -817,16 +790,14 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C2]]
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
+ ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
 ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
@@ -835,14 +806,16 @@
 ; GFX9-NEXT: {{ $}}
 ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32)
- ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY2]](s16), [[COPY3]](s16)
+ ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
 %0:_(<2 x s16>) = COPY $vgpr0
 %1:_(<2 x s16>) = COPY $vgpr1
 %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(2, 1)
@@ -870,18 +843,14 @@
 ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
 ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
 ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
- ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
- ; GFX8-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
- ; GFX8-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
 ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]]
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST2]], [[C1]]
+ ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
 ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
 ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
 ; GFX8-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]]
- ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]]
+ ; GFX8-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C1]]
+ ; GFX8-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
 ; GFX8-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C]](s32)
 ; GFX8-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
 ; GFX8-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
@@ -894,18 +863,22 @@
 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
 ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
 ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
 ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32)
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
- ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
- ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32)
- ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>)
+ ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s16) = COPY [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY2]](s16), [[COPY3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY4]](s16), [[COPY5]](s16)
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
 ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
 %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3
@@ -929,20 +902,15 @@
 ; GFX8-NEXT: {{ $}}
 ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
- ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
- ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>)
- ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32)
- ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C1]](s32)
- ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>)
- ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>)
- ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV2]](s32)
- ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32)
- ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C2]]
- ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C2]]
- ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32)
+ ; GFX8-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+ ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+ ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND
[[LSHR]], [[C1]] + ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C1]] + ; GFX8-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) ; GFX8-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -951,18 +919,17 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C1]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>) - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY2]](s16), [[COPY3]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(1, 0) @@ -983,15 +950,12 @@ ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) - ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX8-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX8-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) - ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) - ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>) - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) - ; GFX8-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32) + ; GFX8-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX8-NEXT: 
[[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] @@ -1004,17 +968,18 @@ ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) - ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) - ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST1]](<2 x s32>) - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV3]](s32) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s16) = COPY [[TRUNC]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s16) = COPY [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[COPY2]](s16), [[COPY3]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(1, 3) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir @@ -433,26 +433,32 @@ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC 
[[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[DEF2]](s32) - ; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF2]](s16) + ; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[SMAX]](<2 x s16>) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[SMAX1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST4]](s32), [[LSHR2]](s32), [[BITCAST5]](s32) - ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST4]](s32), [[LSHR2]](s32), [[BITCAST5]](s32) + ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR4]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_SMAX %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir @@ -433,26 +433,32 @@ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC 
[[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[DEF2]](s32) - ; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF2]](s16) + ; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[SMIN]](<2 x s16>) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[SMIN1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST4]](s32), [[LSHR2]](s32), [[BITCAST5]](s32) - ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST4]](s32), [[LSHR2]](s32), [[BITCAST5]](s32) + ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR4]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_SMIN %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir @@ -197,11 +197,8 @@ ; GFX9-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV3]], 16 ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG2]], [[SEXT_INREG3]] ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[MUL1]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ASHR]](s32), [[ASHR1]](s32) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST]], 16 - ; GFX9-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 16 + ; GFX9-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16 + ; GFX9-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR1]], 16 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG4]](s32), [[SEXT_INREG5]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -257,25 +254,27 @@ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 
; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32) ; GFX9-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY2]], 8 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG2]](s32) ; GFX9-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY3]], 8 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SEXT_INREG2]](s32), [[SEXT_INREG3]](s32) - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]] + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32) - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[MUL]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[MUL]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[ASHR]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C2]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C2]] ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C]](s16) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) @@ -381,40 +380,45 @@ ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C1]](s32) ; GFX9-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C2]](s32) ; GFX9-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32) ; GFX9-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 8 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC 
[[SEXT_INREG1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; GFX9-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG2]](s32) ; GFX9-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR3]], 8 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SEXT_INREG2]](s32), [[SEXT_INREG3]](s32) - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]] - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[MUL]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[MUL]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR1]], 8 + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG4]](s32) ; GFX9-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR2]], 8 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SEXT_INREG4]](s32), [[SEXT_INREG5]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-NEXT: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR4]], 8 + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG6]](s32) ; GFX9-NEXT: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR5]], 8 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SEXT_INREG6]](s32), [[SEXT_INREG7]](s32) - ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR_TRUNC3]], [[BUILD_VECTOR_TRUNC4]] - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[MUL1]], [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG7]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR3]], [[BUILD_VECTOR4]] + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16) + ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[MUL1]], [[BUILD_VECTOR5]](<2 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[ASHR]](<2 x s16>) ; GFX9-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[ASHR1]](<2 x s16>) ; GFX9-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND 
[[BITCAST]], [[C3]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C3]] + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C4]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]] ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C3]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C4]] ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR7]], [[C3]] + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR7]], [[C4]] ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulo.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulo.mir @@ -252,17 +252,13 @@ ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG3]], [[SEXT_INREG4]] ; GFX9-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[MUL1]], 16 ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[MUL1]](s32), [[SEXT_INREG5]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[MUL]](s32), [[MUL1]](s32) ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1) ; GFX9-NEXT: [[SEXT_INREG6:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ANYEXT]], 1 ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP1]](s1) ; GFX9-NEXT: [[SEXT_INREG7:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ANYEXT1]], 1 ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG6]](s32), [[SEXT_INREG7]](s32) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[SEXT_INREG8:%[0-9]+]]:_(s32) = G_SEXT_INREG [[BITCAST]], 16 - ; GFX9-NEXT: [[SEXT_INREG9:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 16 + ; GFX9-NEXT: [[SEXT_INREG8:%[0-9]+]]:_(s32) = G_SEXT_INREG [[MUL]], 16 + ; GFX9-NEXT: [[SEXT_INREG9:%[0-9]+]]:_(s32) = G_SEXT_INREG [[MUL1]], 16 ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG8]](s32), [[SEXT_INREG9]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s32>) ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir @@ -2762,6 +2762,7 @@ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] ; GFX9-NEXT: [[XOR2:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[ASHR]] ; GFX9-NEXT: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[XOR2]], [[ASHR]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SUB4]](s32) ; GFX9-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 16 ; GFX9-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR1]], 16 ; GFX9-NEXT: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SEXT_INREG2]], [[C1]](s32) @@ -2789,8 +2790,9 @@ ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT 
[[ICMP3]](s1), [[SUB8]], [[SELECT2]] ; GFX9-NEXT: [[XOR5:%[0-9]+]]:_(s32) = G_XOR [[SELECT3]], [[ASHR2]] ; GFX9-NEXT: [[SUB9:%[0-9]+]]:_(s32) = G_SUB [[XOR5]], [[ASHR2]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SUB4]](s32), [[SUB9]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SUB9]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX10-LABEL: name: test_srem_v2s16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -2831,6 +2833,7 @@ ; GFX10-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] ; GFX10-NEXT: [[XOR2:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[ASHR]] ; GFX10-NEXT: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[XOR2]], [[ASHR]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SUB4]](s32) ; GFX10-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR]], 16 ; GFX10-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LSHR1]], 16 ; GFX10-NEXT: [[ASHR2:%[0-9]+]]:_(s32) = G_ASHR [[SEXT_INREG2]], [[C1]](s32) @@ -2858,8 +2861,9 @@ ; GFX10-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB8]], [[SELECT2]] ; GFX10-NEXT: [[XOR5:%[0-9]+]]:_(s32) = G_XOR [[SELECT3]], [[ASHR2]] ; GFX10-NEXT: [[SUB9:%[0-9]+]]:_(s32) = G_SUB [[XOR5]], [[ASHR2]] - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SUB4]](s32), [[SUB9]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SUB9]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_SREM %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sshlsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sshlsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sshlsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sshlsat.mir @@ -487,10 +487,8 @@ ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s16) = G_SELECT [[ICMP2]](s1), [[C1]], [[C2]] ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[TRUNC1]](s16), [[ASHR1]] ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s16) = G_SELECT [[ICMP3]](s1), [[SELECT2]], [[SHL1]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT1]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT1]](s16), [[SELECT3]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_SSHLSAT %0, %1 @@ -672,15 +670,15 @@ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = 
G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT1]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT5]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[BITCAST3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR3]](s32), [[BITCAST4]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT1]](s16), [[SELECT3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT5]](s16), [[TRUNC6]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC7]](s16), [[TRUNC8]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 @@ -878,13 +876,9 @@ ; GFX9-NEXT: [[SELECT6:%[0-9]+]]:_(s16) = G_SELECT [[ICMP6]](s1), [[C1]], [[C2]] ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[TRUNC3]](s16), [[ASHR3]] ; GFX9-NEXT: [[SELECT7:%[0-9]+]]:_(s16) = G_SELECT [[ICMP7]](s1), [[SELECT6]], [[SHL3]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT1]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT5]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT7]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT1]](s16), [[SELECT3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT5]](s16), [[SELECT7]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir @@ -249,23 +249,26 @@ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32) - ; 
GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[LSHR1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY2]](s32) - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR]], [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR1]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[SSUBSAT:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[SHL]], [[SHL1]] - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SSUBSAT]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(<2 x s16>) = G_ASHR [[SSUBSAT]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[ASHR]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) @@ -587,31 +590,43 @@ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; 
GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[SSUBSAT:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[SSUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[SSUBSAT:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[SSUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_SSUBSAT [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[SSUBSAT]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[SSUBSAT1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[LSHR3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[BITCAST5]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR4]](s32), [[BITCAST6]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[TRUNC9]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) 
%0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir @@ -6646,22 +6646,30 @@ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<12 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<12 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[LSHR3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<8 x s16>) ; GFX9-NEXT: G_STORE [[BITCAST6]](<4 x s32>), [[COPY]](p1) :: (store (<4 x s32>), addrspace 1) ; GFX9-NEXT: 
[[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sub.mir @@ -236,22 +236,27 @@ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY6]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[COPY4]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY4]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[SUB]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[SUB1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: S_ENDPGM 0, implicit [[TRUNC]](s16), implicit [[TRUNC1]](s16), implicit [[TRUNC2]](s16) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: S_ENDPGM 0, implicit [[TRUNC6]](s16), implicit [[TRUNC7]](s16), implicit [[TRUNC8]](s16) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir @@ -194,23 +194,26 @@ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; 
GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[LSHR1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY2]](s32) - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR]], [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR1]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[UADDSAT:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[SHL]], [[SHL1]] - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UADDSAT]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UADDSAT]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LSHR2]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) @@ -457,31 +460,43 @@ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: 
[[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[UADDSAT:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[UADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[UADDSAT:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[UADDSAT1:%[0-9]+]]:_(<2 x s16>) = G_UADDSAT [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UADDSAT]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UADDSAT1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[LSHR3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[BITCAST5]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR4]](s32), [[BITCAST6]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), 
[[TRUNC9]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir @@ -2433,6 +2433,7 @@ ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT1]](s32), [[AND1]] ; GFX9-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[SELECT]], [[C4]] ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[ADD2]], [[SELECT]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SELECT2]](s32) ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] ; GFX9-NEXT: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[AND3]](s32) @@ -2454,8 +2455,9 @@ ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT4]](s32), [[AND3]] ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[SELECT3]], [[C4]] ; GFX9-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD5]], [[SELECT3]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT2]](s32), [[SELECT5]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SELECT5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX10-LABEL: name: test_udiv_v2s16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -2491,6 +2493,7 @@ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT1]](s32), [[AND1]] ; GFX10-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[SELECT]], [[C4]] ; GFX10-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[ADD2]], [[SELECT]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SELECT2]](s32) ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] ; GFX10-NEXT: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[AND3]](s32) @@ -2512,8 +2515,9 @@ ; GFX10-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT4]](s32), [[AND3]] ; GFX10-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[SELECT3]], [[C4]] ; GFX10-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[ADD5]], [[SELECT3]] - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT2]](s32), [[SELECT5]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SELECT5]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_UDIV %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir @@ -438,26 +438,32 @@ ; GFX9-NEXT: 
[[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[DEF2]](s32) - ; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[UMAX1:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF2]](s16) + ; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[UMAX1:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UMAX]](<2 x s16>) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UMAX1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST4]](s32), [[LSHR2]](s32), [[BITCAST5]](s32) - ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST4]](s32), [[LSHR2]](s32), [[BITCAST5]](s32) + ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR4]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_UMAX %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir @@ -438,26 +438,32 @@ ; GFX9-NEXT: 
[[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[DEF2]](s32) - ; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF2]](s16) + ; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UMIN]](<2 x s16>) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UMIN1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST4]](s32), [[LSHR2]](s32), [[BITCAST5]](s32) - ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST4]](s32), [[LSHR2]](s32), [[BITCAST5]](s32) + ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR4]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_UMIN %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulh.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulh.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulh.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulh.mir @@ -400,11 +400,8 @@ ; GFX9-NEXT: 
[[AND3:%[0-9]+]]:_(s32) = G_AND [[UV3]], [[C]] ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND2]], [[AND3]] ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[MUL1]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) - ; GFX9-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C]] - ; GFX9-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C]] + ; GFX9-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C]] + ; GFX9-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND4]](s32), [[AND5]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -482,20 +479,14 @@ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]] ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[AND1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[AND]](s16), [[AND1]](s16) ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]] ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C]] - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[AND2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[AND3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]] + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[AND2]](s16), [[AND3]](s16) + ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[C2]](s32) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[MUL]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[MUL]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) ; GFX9-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C]] ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[COPY5]](s32) @@ -503,8 +494,8 @@ ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(s16) = G_MUL [[AND4]], [[AND5]] ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[MUL1]], [[C1]](s16) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LSHR]](<2 x s16>) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C3]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C]] @@ -519,7 
+510,7 @@ ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL1]] ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C2]](s32) ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32) %0:_(s32) = COPY $vgpr0 @@ -589,31 +580,25 @@ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]] ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[AND1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[AND]](s16), [[AND1]](s16) ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]] ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C]] - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[AND2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[AND3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]] + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[AND2]](s16), [[AND3]](s16) + ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C2]](s32) - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[MUL]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[MUL]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LSHR]](<2 x s16>) ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C3]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C]] ; GFX9-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C]] ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C1]](s16) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL]] - ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT4]](s32) + ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -717,47 +702,36 @@ ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) 
= G_ANYEXT [[AND1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[AND]](s16), [[AND1]](s16) ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[AND2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[AND3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]] - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-NEXT: [[LSHR6:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[MUL]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[AND2]](s16), [[AND3]](s16) + ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C4]](s16), [[C4]](s16) + ; GFX9-NEXT: [[LSHR6:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[MUL]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX9-NEXT: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[AND4]](s16) - ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[AND5]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT4]](s32), [[ANYEXT5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[AND4]](s16), [[AND5]](s16) ; GFX9-NEXT: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9-NEXT: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[AND6]](s16) - ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[AND7]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT6]](s32), [[ANYEXT7]](s32) - ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR_TRUNC3]], [[BUILD_VECTOR_TRUNC4]] - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-NEXT: [[LSHR7:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[MUL1]], [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[AND6]](s16), [[AND7]](s16) + ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(<2 x s16>) = G_MUL [[BUILD_VECTOR3]], [[BUILD_VECTOR4]] + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C4]](s16), [[C4]](s16) + ; GFX9-NEXT: [[LSHR7:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[MUL1]], [[BUILD_VECTOR5]](<2 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LSHR6]](<2 x s16>) ; GFX9-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[LSHR7]](<2 x s16>) ; GFX9-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; GFX9-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND 
[[BITCAST]], [[C4]] - ; GFX9-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR8]], [[C4]] + ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX9-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C5]] + ; GFX9-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR8]], [[C5]] ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL]] - ; GFX9-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C4]] + ; GFX9-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[BITCAST1]], [[C5]] ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C1]](s32) ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; GFX9-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[LSHR9]], [[C4]] + ; GFX9-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[LSHR9]], [[C5]] ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32) ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulo.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umulo.mir @@ -594,18 +594,14 @@ ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[AND4]] ; GFX9-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[MUL1]], [[C]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[MUL1]](s32), [[AND5]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[MUL]](s32), [[MUL1]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1) ; GFX9-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]] ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP1]](s1) ; GFX9-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND6]](s32), [[AND7]](s32) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[BUILD_VECTOR_TRUNC]](<2 x s16>) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) - ; GFX9-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C]] - ; GFX9-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C]] + ; GFX9-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C]] + ; GFX9-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[MUL1]], [[C]] ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND8]](s32), [[AND9]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s32>) ; GFX9-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir @@ -2332,6 +2332,7 @@ ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]] ; GFX9-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]] ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SELECT1]](s32) ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] ; GFX9-NEXT: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[AND3]](s32) @@ -2351,8 +2352,9 @@ ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT2]](s32), [[AND3]] ; GFX9-NEXT: 
[[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SELECT2]], [[AND3]] ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB7]], [[SELECT2]] - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT1]](s32), [[SELECT3]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SELECT3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; GFX10-LABEL: name: test_urem_v2s16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -2385,6 +2387,7 @@ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]] ; GFX10-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]] ; GFX10-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SELECT1]](s32) ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C1]] ; GFX10-NEXT: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[AND3]](s32) @@ -2404,8 +2407,9 @@ ; GFX10-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT2]](s32), [[AND3]] ; GFX10-NEXT: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SELECT2]], [[AND3]] ; GFX10-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB7]], [[SELECT2]] - ; GFX10-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SELECT1]](s32), [[SELECT3]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SELECT3]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_UREM %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ushlsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ushlsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ushlsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ushlsat.mir @@ -415,10 +415,8 @@ ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[SHL1]], [[TRUNC3]](s16) ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[TRUNC1]](s16), [[LSHR3]] ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s16) = G_SELECT [[ICMP1]](s1), [[C1]], [[SHL1]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_USHLSAT %0, %1 @@ -576,15 +574,15 @@ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST 
[[UV4]](<2 x s16>) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT2]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[BITCAST3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR6]](s32), [[BITCAST4]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT2]](s16), [[TRUNC6]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC7]](s16), [[TRUNC8]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 @@ -752,13 +750,9 @@ ; GFX9-NEXT: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[SHL3]], [[TRUNC7]](s16) ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[TRUNC3]](s16), [[LSHR7]] ; GFX9-NEXT: [[SELECT3:%[0-9]+]]:_(s16) = G_SELECT [[ICMP3]](s1), [[C1]], [[SHL3]] - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT]](s16) - ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) - ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT2]](s16) - ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT3]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT2]](s32), [[ANYEXT3]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT]](s16), [[SELECT1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SELECT2]](s16), [[SELECT3]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir @@ -187,23 +187,26 @@ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32) - ; GFX9-NEXT: 
[[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[LSHR1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY2]](s32) - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR]], [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR1]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[SHL]], [[SHL1]] - ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[USUBSAT]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[USUBSAT]], [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LSHR2]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C2]](s16) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) @@ -441,31 +444,43 @@ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<6 x s16>) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-NEXT: 
[[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR1]](s32), [[BITCAST2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR2]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[USUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[USUBSAT:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[USUBSAT1:%[0-9]+]]:_(<2 x s16>) = G_USUBSAT [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[USUBSAT]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[USUBSAT1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV3:%[0-9]+]]:_(<2 x s16>), [[UV4:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[LSHR3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[BITCAST5]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[LSHR4]](s32), [[BITCAST6]](s32) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[TRUNC9]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC10]](s16), [[TRUNC11]](s16) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR4]](<2 x s16>), [[BUILD_VECTOR5]](<2 x s16>), [[BUILD_VECTOR6]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) %0:_(<6 x 
s16>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s16>), %2:_(<3 x s16>) = G_UNMERGE_VALUES %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx8-plus.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx8-plus.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx8-plus.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-vector-args-gfx8-plus.mir @@ -110,28 +110,37 @@ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC5]](s16), [[DEF]](s16) + ; GFX9-NEXT: [[ADD:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[ADD1:%[0-9]+]]:_(<2 x s16>) = G_ADD [[BUILD_VECTOR1]], [[BUILD_VECTOR3]] ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[ADD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[ADD1]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST5]](s32), [[DEF]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC4]](<2 x s16>) - 
; GFX9-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC8]](s16), [[DEF]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR4]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[BUILD_VECTOR5]](<2 x s16>) ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 %3:_(<2 x s16>) = COPY $vgpr0 %4:_(<2 x s16>) = COPY $vgpr1 @@ -201,28 +210,33 @@ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC1]](<2 x s16>) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC3]](s16), [[TRUNC4]](s16) + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[TRUNC5]](s16) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[SHL]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR2]](s32) - ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL1]](s16) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[DEF]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC2]](<2 x s16>) - ; GFX9-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC6]](s16), 
[[TRUNC7]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[SHL1]](s16), [[DEF]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR2]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[BUILD_VECTOR3]](<2 x s16>) ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 %3:_(<2 x s16>) = COPY $vgpr0 %4:_(<2 x s16>) = COPY $vgpr1 @@ -408,46 +422,51 @@ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) ; GFX9-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) ; GFX9-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) ; GFX9-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9-NEXT: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) ; GFX9-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST]](s32), [[LSHR]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST1]](s32), [[LSHR1]](s32) - ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST2]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST3]](s32), [[LSHR2]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST4]](s32), [[LSHR3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST5]](s32), [[DEF]](s32) - ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC]] - ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC3]] - ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] - ; GFX9-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC1]] - ; GFX9-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC4]] - ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] - ; GFX9-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[BUILD_VECTOR_TRUNC2]] - ; GFX9-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(<2 x s16>) = 
G_FCANONICALIZE [[BUILD_VECTOR_TRUNC5]] - ; GFX9-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]] - ; GFX9-NEXT: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM_IEEE]](<2 x s16>) - ; GFX9-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) - ; GFX9-NEXT: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM_IEEE1]](<2 x s16>) - ; GFX9-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) - ; GFX9-NEXT: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM_IEEE2]](<2 x s16>) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST6]](s32), [[LSHR4]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST7]](s32), [[LSHR5]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[BITCAST8]](s32), [[DEF]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC6]](<2 x s16>) - ; GFX9-NEXT: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC7]](<2 x s16>) - ; GFX9-NEXT: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC8]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC]] + ; GFX9-NEXT: [[FCANONICALIZE1:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC5]] + ; GFX9-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE]], [[FCANONICALIZE1]] + ; GFX9-NEXT: [[FCANONICALIZE2:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC1]] + ; GFX9-NEXT: [[FCANONICALIZE3:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC6]] + ; GFX9-NEXT: [[FMAXNUM_IEEE1:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE2]], [[FCANONICALIZE3]] + ; GFX9-NEXT: [[FCANONICALIZE4:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC2]] + ; GFX9-NEXT: [[FCANONICALIZE5:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC7]] + ; GFX9-NEXT: [[FMAXNUM_IEEE2:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE4]], [[FCANONICALIZE5]] + ; GFX9-NEXT: [[FCANONICALIZE6:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC3]] + ; GFX9-NEXT: [[FCANONICALIZE7:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC8]] + ; GFX9-NEXT: [[FMAXNUM_IEEE3:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE6]], [[FCANONICALIZE7]] + ; GFX9-NEXT: [[FCANONICALIZE8:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC4]] + ; GFX9-NEXT: [[FCANONICALIZE9:%[0-9]+]]:_(s16) = G_FCANONICALIZE [[TRUNC9]] + ; GFX9-NEXT: [[FMAXNUM_IEEE4:%[0-9]+]]:_(s16) = G_FMAXNUM_IEEE [[FCANONICALIZE8]], [[FCANONICALIZE9]] + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMAXNUM_IEEE]](s16), [[FMAXNUM_IEEE1]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMAXNUM_IEEE2]](s16), [[FMAXNUM_IEEE3]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMAXNUM_IEEE4]](s16), [[DEF]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: $vgpr2 = COPY [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %2:_(<2 x s16>) = COPY $vgpr0 %3:_(<2 x s16>) = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll @@ -448,8 +448,7 @@ define amdgpu_ps float @atomic_add_i32_2d(<8 x i32> 
inreg %rsrc, i32 %data, i16 %s, i16 %t) { ; GFX9-LABEL: atomic_add_i32_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -458,19 +457,19 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 @@ -487,38 +486,33 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %r) { ; GFX9-LABEL: atomic_add_i32_3d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_lshl_b32 s8, s0, 16 ; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 -; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc a16 +; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -530,38 +524,33 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %face) { ; GFX9-LABEL: atomic_add_i32_cube: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: s_mov_b32 s2, s4 -; GFX9-NEXT: s_mov_b32 s4, s6 -; GFX9-NEXT: s_mov_b32 s6, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, 
v2
-; GFX9-NEXT: s_lshl_b32 s8, s0, 16
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2
-; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8
-; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da
+; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1
+; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 da
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i32_cube:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_lshl_b32 s8, s0, 16
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8
 ; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
+; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
 ; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc a16
+; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -573,8 +562,7 @@
 define amdgpu_ps float @atomic_add_i32_1darray(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %slice) {
 ; GFX9-LABEL: atomic_add_i32_1darray:
 ; GFX9: ; %bb.0: ; %main_body
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_mov_b32 s1, s3
 ; GFX9-NEXT: s_mov_b32 s2, s4
@@ -583,19 +571,19 @@
 ; GFX9-NEXT: s_mov_b32 s5, s7
 ; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
+; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16 da
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i32_1darray:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT: s_mov_b32 s0, s2
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -612,38 +600,33 @@
 define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %slice) {
 ; GFX9-LABEL: atomic_add_i32_2darray:
 ; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: s_lshl_b32 s8, s0, 16
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2
-; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8
-; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da
+; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1
+; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 da
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i32_2darray:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_lshl_b32 s8, s0, 16
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8
 ; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
+; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
 ; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
+; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -655,38 +638,33 @@
 define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %fragid) {
 ; GFX9-LABEL: atomic_add_i32_2dmsaa:
 ; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: s_lshl_b32 s8, s0, 16
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2
-; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8
-; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16
+; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1
+; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i32_2dmsaa:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_lshl_b32 s8, s0, 16
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8
 ; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
+; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
 ; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
+; GFX10-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -698,10 +676,9 @@
 define amdgpu_ps float @atomic_add_i32_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
 ; GFX9-LABEL: atomic_add_i32_2darraymsaa:
 ; GFX9: ; %bb.0: ; %main_body
-; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v1, v1, v5, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_mov_b32 s1, s3
 ; GFX9-NEXT: s_mov_b32 s2, s4
@@ -710,20 +687,20 @@
 ; GFX9-NEXT: s_mov_b32 s5, s7
 ; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2
 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i32_2darraymsaa:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; GFX10-NEXT: s_mov_b32 s0, s2
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4
+; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3
 ; GFX10-NEXT: s_mov_b32 s3, s5
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
@@ -1217,8 +1194,7 @@
 define amdgpu_ps <2 x float> @atomic_add_i64_2d(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t) {
 ; GFX9-LABEL: atomic_add_i64_2d:
 ; GFX9: ; %bb.0: ; %main_body
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_mov_b32 s1, s3
 ; GFX9-NEXT: s_mov_b32 s2, s4
@@ -1227,19 +1203,19 @@
 ; GFX9-NEXT: s_mov_b32 s5, s7
 ; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3
+; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
 ; GFX9-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_2d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT: s_mov_b32 s0, s2
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -1256,38 +1232,33 @@
 define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %r) {
 ; GFX9-LABEL: atomic_add_i64_3d:
 ; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: s_lshl_b32 s8, s0, 16
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3
-; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8
-; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16
+; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2
+; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_3d:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_lshl_b32 s8, s0, 16
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8
 ; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
+; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
 ; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc a16
+; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -1299,38 +1270,33 @@
 define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %face) {
 ; GFX9-LABEL: atomic_add_i64_cube:
 ; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: s_lshl_b32 s8, s0, 16
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3
-; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8
-; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da
+; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2
+; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_cube:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_lshl_b32 s8, s0, 16
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8
 ; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
+; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
 ; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc a16
+; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -1342,8 +1308,7 @@
 define amdgpu_ps <2 x float> @atomic_add_i64_1darray(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %slice) {
 ; GFX9-LABEL: atomic_add_i64_1darray:
 ; GFX9: ; %bb.0: ; %main_body
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_mov_b32 s1, s3
 ; GFX9-NEXT: s_mov_b32 s2, s4
@@ -1352,19 +1317,19 @@
 ; GFX9-NEXT: s_mov_b32 s5, s7
 ; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3
+; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
 ; GFX9-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 da
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_1darray:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT: s_mov_b32 s0, s2
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
 ; GFX10-NEXT: s_mov_b32 s6, s8
@@ -1381,38 +1346,33 @@
 define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %slice) {
 ; GFX9-LABEL: atomic_add_i64_2darray:
 ; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: s_lshl_b32 s8, s0, 16
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3
-; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8
-; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da
+; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2
+; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_2darray:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_lshl_b32 s8, s0, 16
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8
 ; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
+; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
 ; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
+; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -1424,38 +1384,33 @@
 define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %fragid) {
 ; GFX9-LABEL: atomic_add_i64_2dmsaa:
 ; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: s_lshl_b32 s8, s0, 16
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3
-; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8
-; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16
+; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2
+; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_2dmsaa:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_lshl_b32 s8, s0, 16
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8
 ; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
+; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
 ; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
+; GFX10-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: ; return to shader part epilog
 main_body:
@@ -1467,10 +1422,9 @@
 define amdgpu_ps <2 x float> @atomic_add_i64_2darraymsaa(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
 ; GFX9-LABEL: atomic_add_i64_2darraymsaa:
 ; GFX9: ; %bb.0: ; %main_body
-; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_and_or_b32 v2, v2, v6, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v4
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_mov_b32 s1, s3
 ; GFX9-NEXT: s_mov_b32 s2, s4
@@ -1479,20 +1433,20 @@
 ; GFX9-NEXT: s_mov_b32 s5, s7
 ; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX9-NEXT: v_lshl_or_b32 v3, v5, 16, v3
 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: atomic_add_i64_2darraymsaa:
 ; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
 ; GFX10-NEXT: s_mov_b32 s0, s2
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, v5
+; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v4
 ; GFX10-NEXT: s_mov_b32 s3, s5
 ; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -9,8 +9,7 @@
 ; GFX9-NEXT: s_mov_b64 s[14:15], exec
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_wqm_b64 exec, exec
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT: s_mov_b32 s1, s3
 ; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
@@ -22,7 +21,7 @@
 ; GFX9-NEXT: s_mov_b32 s9, s11
 ; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -33,7 +32,7 @@
 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
@@ -45,7 +44,7 @@
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -61,24 +60,21 @@
 ; GFX9-NEXT: s_mov_b64 s[14:15], exec
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_wqm_b64 exec, exec
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: s_lshl_b32 s12, s0, 16
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
 ; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1
-; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
-; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da
+; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
@@ -87,23 +83,21 @@
 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10NSA-NEXT: s_mov_b32 s2, s4
-; GFX10NSA-NEXT: s_mov_b32 s4, s6
-; GFX10NSA-NEXT: s_mov_b32 s6, s8
-; GFX10NSA-NEXT: s_mov_b32 s8, s10
-; GFX10NSA-NEXT: s_mov_b32 s10, s12
-; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
+; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
+; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
+; GFX10NSA-NEXT: s_mov_b32 s6, s8
 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
+; GFX10NSA-NEXT: s_mov_b32 s8, s10
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
+; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12
+; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
+; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT: ; return to shader part epilog
 main_body:
@@ -117,24 +111,21 @@
 ; GFX9-NEXT: s_mov_b64 s[14:15], exec
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_wqm_b64 exec, exec
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: s_lshl_b32 s12, s0, 16
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
 ; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1
-; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
-; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da
+; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
@@ -143,23 +134,21 @@
 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10NSA-NEXT: s_mov_b32 s2, s4
-; GFX10NSA-NEXT: s_mov_b32 s4, s6
-; GFX10NSA-NEXT: s_mov_b32 s6, s8
-; GFX10NSA-NEXT: s_mov_b32 s8, s10
-; GFX10NSA-NEXT: s_mov_b32 s10, s12
-; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
+; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
+; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
+; GFX10NSA-NEXT: s_mov_b32 s6, s8
 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
+; GFX10NSA-NEXT: s_mov_b32 s8, s10
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
+; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12
+; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT: ; return to shader part epilog
 main_body:
@@ -173,8 +162,7 @@
 ; GFX9-NEXT: s_mov_b64 s[14:15], exec
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_wqm_b64 exec, exec
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT: s_mov_b32 s1, s3
 ; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
@@ -186,7 +174,7 @@
 ; GFX9-NEXT: s_mov_b32 s9, s11
 ; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
+; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -197,7 +185,7 @@
 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
@@ -209,7 +197,7 @@
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
 ; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -225,24 +213,21 @@
 ; GFX9-NEXT: s_mov_b64 s[14:15], exec
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_wqm_b64 exec, exec
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: s_lshl_b32 s12, s0, 16
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
 ; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1
-; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
-; GFX9-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
+; GFX9-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
@@ -251,23 +236,21 @@
 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10NSA-NEXT: s_mov_b32 s2, s4
-; GFX10NSA-NEXT: s_mov_b32 s4, s6
-; GFX10NSA-NEXT: s_mov_b32 s6, s8
-; GFX10NSA-NEXT: s_mov_b32 s8, s10
-; GFX10NSA-NEXT: s_mov_b32 s10, s12
-; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
+; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
+; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
+; GFX10NSA-NEXT: s_mov_b32 s6, s8
 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
+; GFX10NSA-NEXT: s_mov_b32 s8, s10
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
+; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12
+; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT: ; return to shader part epilog
 main_body:
@@ -281,24 +264,23 @@
 ; GFX9-NEXT: s_mov_b64 s[14:15], exec
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_wqm_b64 exec, exec
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: s_lshl_b32 s12, s0, 16
+; GFX9-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
 ; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2
-; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12
+; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0
 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
-; GFX9-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
+; GFX9-NEXT: image_gather4_c_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
@@ -307,23 +289,21 @@
 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10NSA-NEXT: s_mov_b32 s2, s4
-; GFX10NSA-NEXT: s_mov_b32 s4, s6
-; GFX10NSA-NEXT: s_mov_b32 s6, s8
-; GFX10NSA-NEXT: s_mov_b32 s8, s10
-; GFX10NSA-NEXT: s_mov_b32 s10, s12
-; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
+; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
+; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
+; GFX10NSA-NEXT: s_mov_b32 s6, s8
 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
+; GFX10NSA-NEXT: s_mov_b32 s8, s10
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
+; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
-; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12
+; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT: ; return to shader part epilog
 main_body:
@@ -337,22 +317,19 @@
 ; GFX9-NEXT: s_mov_b64 s[14:15], exec
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_wqm_b64 exec, exec
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: s_lshl_b32 s12, s0, 16
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
 ; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v0, v0, v3, s12
-; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
+; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -363,21 +340,19 @@
 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10NSA-NEXT: s_mov_b32 s2, s4
-; GFX10NSA-NEXT: s_mov_b32 s4, s6
-; GFX10NSA-NEXT: s_mov_b32 s6, s8
-; GFX10NSA-NEXT: s_mov_b32 s8, s10
-; GFX10NSA-NEXT: s_mov_b32 s10, s12
-; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
+; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
+; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
+; GFX10NSA-NEXT: s_mov_b32 s6, s8
 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
+; GFX10NSA-NEXT: s_mov_b32 s8, s10
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
+; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
-; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -393,22 +368,19 @@
 ; GFX9-NEXT: s_mov_b64 s[14:15], exec
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_wqm_b64 exec, exec
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: s_lshl_b32 s12, s0, 16
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
 ; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v0, v0, v4, s12
-; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3
+; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -419,21 +391,19 @@
 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10NSA-NEXT: s_mov_b32 s2, s4
-; GFX10NSA-NEXT: s_mov_b32 s4, s6
-; GFX10NSA-NEXT: s_mov_b32 s6, s8
-; GFX10NSA-NEXT: s_mov_b32 s8, s10
-; GFX10NSA-NEXT: s_mov_b32 s10, s12
-; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10NSA-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
+; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
+; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
+; GFX10NSA-NEXT: s_mov_b32 s6, s8
 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
+; GFX10NSA-NEXT: s_mov_b32 s8, s10
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
+; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
-; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
+; GFX10NSA-NEXT: v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@@ -449,25 +419,23 @@
 ; GFX9-NEXT: s_mov_b64 s[14:15], exec
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_wqm_b64 exec, exec
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: s_lshl_b32 s12, s0, 16
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
 ; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v0, v0, v4, s12
-; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2
-; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12
+; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0
 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
-; GFX9-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
+; GFX9-NEXT: image_gather4_b_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
@@ -476,24 +444,21 @@
 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10NSA-NEXT: s_mov_b32 s2, s4
-; GFX10NSA-NEXT: s_mov_b32 s4, s6
-; GFX10NSA-NEXT: s_mov_b32 s6, s8
-; GFX10NSA-NEXT: s_mov_b32 s8, s10
-; GFX10NSA-NEXT: s_mov_b32 s10, s12
-; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
+; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
+; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
+; GFX10NSA-NEXT: s_mov_b32 s6, s8
 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
+; GFX10NSA-NEXT: s_mov_b32 s8, s10
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
+; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
-; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
-; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12
+; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT: ; return to shader part epilog
 main_body:
@@ -507,23 +472,21 @@
 ; GFX9-NEXT: s_mov_b64 s[14:15], exec
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_wqm_b64 exec, exec
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
-; GFX9-NEXT: s_lshl_b32 s12, s0, 16
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
 ; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v0, v0, v5, s12
-; GFX9-NEXT: v_and_or_b32 v2, v2, v5, v3
-; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s12
+; GFX9-NEXT: v_mov_b32_e32 v3, v4
+; GFX9-NEXT: v_lshl_or_b32 v2, v5, 16, v2
 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
 ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -534,24 +497,21 @@
 ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10NSA-NEXT: s_mov_b32 s2, s4
-; GFX10NSA-NEXT: s_mov_b32 s4, s6
-; GFX10NSA-NEXT: s_mov_b32 s6, s8
-; GFX10NSA-NEXT: s_mov_b32 s8, s10
-; GFX10NSA-NEXT: s_mov_b32 s10, s12
-; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
+; GFX10NSA-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
+; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
+; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
+; GFX10NSA-NEXT: s_mov_b32 s6, s8
 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
+; GFX10NSA-NEXT: s_mov_b32 s8, s10
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
+; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
-; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
-; GFX10NSA-NEXT: v_and_or_b32 v3, 0xffff, v4, s12
+; GFX10NSA-NEXT: v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
-; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT: ; return to shader part epilog
 main_body:
@@ -562,46 +522,41 @@
 define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) {
 ; GFX9-LABEL: gather4_l_2d:
 ; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: s_lshl_b32 s12, s0, 16
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
 ; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1
-; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12
-; GFX9-NEXT: image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
+; GFX9-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10NSA-LABEL: gather4_l_2d:
 ; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
-; GFX10NSA-NEXT: s_mov_b32 s2, s4
-; GFX10NSA-NEXT: s_mov_b32 s4, s6
-; GFX10NSA-NEXT: s_mov_b32 s6, s8
-; GFX10NSA-NEXT: s_mov_b32 s8, s10
-; GFX10NSA-NEXT: s_mov_b32 s10, s12
-; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
+; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
+; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
+; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
+; GFX10NSA-NEXT: s_mov_b32 s6, s8
 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
+; GFX10NSA-NEXT: s_mov_b32 s8, s10
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
+; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT: ; return to shader part epilog
 main_body:
@@ -612,46 +567,43 @@
 define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) {
 ; GFX9-LABEL: gather4_c_l_2d:
 ; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: s_mov_b32 s8, s10
-; GFX9-NEXT: s_mov_b32 s10, s12
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: s_lshl_b32 s12, s0, 16
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
+; GFX9-NEXT: s_mov_b32 s8, s10
 ; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v2
-; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12
-; GFX9-NEXT: image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
+; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0
+; GFX9-NEXT: image_gather4_c_l v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10NSA-LABEL: gather4_c_l_2d:
 ; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
-; GFX10NSA-NEXT: s_mov_b32 s2, s4
-; GFX10NSA-NEXT: s_mov_b32 s4, s6
-; GFX10NSA-NEXT: s_mov_b32 s6, s8
-; GFX10NSA-NEXT: s_mov_b32 s8, s10
-; GFX10NSA-NEXT: s_mov_b32 s10, s12
-; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
-; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
+; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
+; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
+; GFX10NSA-NEXT: s_mov_b32 s6, s8
 ; GFX10NSA-NEXT: s_mov_b32 s7, s9
+; GFX10NSA-NEXT: s_mov_b32 s8, s10
 ; GFX10NSA-NEXT: s_mov_b32 s9, s11
+; GFX10NSA-NEXT: s_mov_b32 s10, s12
 ; GFX10NSA-NEXT: s_mov_b32 s11, s13
-; GFX10NSA-NEXT: image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10NSA-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
 ; GFX10NSA-NEXT: ; return to shader part epilog
 main_body:
@@ -662,8 +614,7 @@
 define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
 ; GFX9-LABEL: gather4_lz_2d:
 ; GFX9: ; %bb.0: ; %main_body
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_mov_b32 s1, s3
 ; GFX9-NEXT: s_mov_b32 s2, s4
@@ -676,19 +627,19 @@
 ; GFX9-NEXT: s_mov_b32 s9, s11
 ; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10NSA-LABEL: gather4_lz_2d:
 ; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
-; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
@@ -708,8 +659,7 @@
 define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) {
 ; GFX9-LABEL: gather4_c_lz_2d:
 ; GFX9: ; %bb.0: ; %main_body
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_mov_b32 s1, s3
 ; GFX9-NEXT: s_mov_b32 s2, s4
@@ -722,19 +672,19 @@
 ; GFX9-NEXT: s_mov_b32 s9, s11
 ; GFX9-NEXT: s_mov_b32 s10, s12
 ; GFX9-NEXT: s_mov_b32 s11, s13
-; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
+; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10NSA-LABEL: gather4_c_lz_2d:
 ; GFX10NSA: ; %bb.0: ; %main_body
-; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
 ; GFX10NSA-NEXT: s_mov_b32 s0, s2
 ; GFX10NSA-NEXT: s_mov_b32 s1, s3
 ; GFX10NSA-NEXT: s_mov_b32 s2, s4
 ; GFX10NSA-NEXT: s_mov_b32 s3, s5
-; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
 ; GFX10NSA-NEXT: s_mov_b32 s4, s6
 ; GFX10NSA-NEXT: s_mov_b32 s5, s7
 ; GFX10NSA-NEXT: s_mov_b32 s6, s8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@@ -545,13 +545,10 @@
 ; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
 ; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s0
-; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_1d_v3f16_xyz:
@@ -565,13 +562,10 @@
 ; GFX10-NEXT: s_mov_b32 s6, s8
 ; GFX10-NEXT: s_mov_b32 s7, s9
 ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s0
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
 ; GFX10-NEXT: ; return to shader part epilog
 ;
 ; GFX11-LABEL: load_1d_v3f16_xyz:
@@ -585,13 +579,11 @@
 ; GFX11-NEXT: s_mov_b32 s6, s8
 ; GFX11-NEXT: s_mov_b32 s7, s9
 ; GFX11-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 16
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
 ; GFX11-NEXT: ; return to shader part epilog
 %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
 ret <3 x half> %v
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll
@@ -6,10 +6,9 @@
 define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
 ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2
 ; GFX9-NEXT: s_mov_b32 s0, s2
 ; GFX9-NEXT: s_mov_b32 s1, s3
 ; GFX9-NEXT: s_mov_b32 s2, s4
@@ -18,20 +17,20 @@
 ; GFX9-NEXT: s_mov_b32 s5, s7
 ; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1
+; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
 ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: load_2darraymsaa_v4f32_xyzw:
 ; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10PLUS-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX10PLUS-NEXT: s_mov_b32 s0, s2
 ; GFX10PLUS-NEXT: s_mov_b32 s1, s3
 ; GFX10PLUS-NEXT: s_mov_b32 s2, s4
-; GFX10PLUS-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10PLUS-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
+; GFX10PLUS-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10PLUS-NEXT: v_lshl_or_b32 v1, v3, 16, v2
 ; GFX10PLUS-NEXT: s_mov_b32 s3, s5
 ; GFX10PLUS-NEXT: s_mov_b32 s4, s6
 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7
@@ -47,12 +46,11 @@
 define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
 ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_or_b32 v10, v0, v4, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_and_or_b32 v11, v2, v4, v0
+; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0
 ; GFX9-NEXT: v_mov_b32_e32 v6, v5
 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
 ; GFX9-NEXT: v_mov_b32_e32 v8, v5
@@ -79,16 +77,16 @@
 ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT: s_mov_b32 s0, s2
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: v_mov_b32_e32 v6, v5
 ; GFX10-NEXT: v_mov_b32_e32 v7, v5
 ; GFX10-NEXT: v_mov_b32_e32 v8, v5
 ; GFX10-NEXT: v_mov_b32_e32 v9, v5
-; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1
-; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, v3
+; GFX10-NEXT: v_lshl_or_b32 v10, v1, 16, v0
+; GFX10-NEXT: v_lshl_or_b32 v11, v3, 16, v2
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
 ; GFX10-NEXT: s_mov_b32 s4, s6
@@ -109,16 +107,16 @@
 ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX11-NEXT: s_mov_b32 s0, s2
 ; GFX11-NEXT: s_mov_b32 s1, s3
 ; GFX11-NEXT: v_mov_b32_e32 v6, v5
 ; GFX11-NEXT: v_mov_b32_e32 v7, v5
 ; GFX11-NEXT: v_mov_b32_e32 v8, v5
 ; GFX11-NEXT: v_mov_b32_e32 v9, v5
-; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1
-; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, v3
+; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v0
+; GFX11-NEXT: v_lshl_or_b32 v11, v3, 16, v2
 ; GFX11-NEXT: s_mov_b32 s2, s4
 ; GFX11-NEXT: s_mov_b32 s3, s5
 ; GFX11-NEXT: s_mov_b32 s4, s6
@@ -145,12 +143,11 @@
 define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
 ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_and_or_b32 v10, v0, v4, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_and_or_b32 v11, v2, v4, v0
+; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0
 ; GFX9-NEXT: v_mov_b32_e32 v6, v5
 ; GFX9-NEXT: v_mov_b32_e32 v7, v5
 ; GFX9-NEXT: v_mov_b32_e32 v8, v5
@@ -177,16 +174,16 @@
 ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-NEXT: s_mov_b32 s0, s2
 ; GFX10-NEXT: s_mov_b32 s1, s3
 ; GFX10-NEXT: v_mov_b32_e32 v6, v5
 ; GFX10-NEXT: v_mov_b32_e32 v7, v5
 ; GFX10-NEXT: v_mov_b32_e32 v8, v5
 ; GFX10-NEXT: v_mov_b32_e32 v9, v5
-; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1
-; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, v3
+; GFX10-NEXT: v_lshl_or_b32 v10, v1, 16, v0
+; GFX10-NEXT: v_lshl_or_b32 v11, v3, 16, v2
 ; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
 ; GFX10-NEXT: s_mov_b32 s4, s6
@@ -207,16 +204,16 @@
 ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
 ; GFX11-NEXT: s_mov_b32 s0, s2
 ; GFX11-NEXT: s_mov_b32 s1, s3
 ; GFX11-NEXT: v_mov_b32_e32 v6, v5
 ; GFX11-NEXT: v_mov_b32_e32 v7, v5
 ; GFX11-NEXT: v_mov_b32_e32 v8, v5
 ; GFX11-NEXT: v_mov_b32_e32 v9, v5
-; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1
-; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, v3
+; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v0
+; GFX11-NEXT: v_lshl_or_b32 v11, v3, 16, v2
 ; GFX11-NEXT: s_mov_b32 s2, s4
 ; GFX11-NEXT: s_mov_b32 s3, s5
 ; GFX11-NEXT: s_mov_b32 s4, s6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
@@ -6,38 +6,33 @@
 define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %r) {
 ; GFX9-LABEL: load_3d_v4f32_xyzw:
 ; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: s_lshl_b32 s8, s0, 16
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v1
-; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8
-; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
+; GFX9-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf unorm a16
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: load_3d_v4f32_xyzw:
 ; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX10PLUS-NEXT: s_mov_b32 s0, s2
-; GFX10PLUS-NEXT: s_mov_b32 s2, s4
-; GFX10PLUS-NEXT: s_mov_b32 s4, s6
-; GFX10PLUS-NEXT: s_mov_b32 s6, s8
-; GFX10PLUS-NEXT: s_lshl_b32 s8, s0, 16
-; GFX10PLUS-NEXT: v_and_or_b32 v0, 0xffff, v0, v3
-; GFX10PLUS-NEXT: v_and_or_b32 v1, 0xffff, v2, s8
 ; GFX10PLUS-NEXT: s_mov_b32 s1, s3
+; GFX10PLUS-NEXT: s_mov_b32 s2, s4
 ; GFX10PLUS-NEXT: s_mov_b32 s3, s5
+; GFX10PLUS-NEXT: v_lshl_or_b32 v1, v1, 16, v0
+; GFX10PLUS-NEXT: s_mov_b32 s4, s6
 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7
+; GFX10PLUS-NEXT: s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT: s_mov_b32 s7, s9
-; GFX10PLUS-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
+; GFX10PLUS-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
 ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0)
 ; GFX10PLUS-NEXT: ; return to shader part epilog
 %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
@@ -47,90 +42,85 @@
 define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %r) {
 ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, v2
+; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: v_mov_b32_e32 v11, v7
+; GFX9-NEXT: v_mov_b32_e32 v0, v7
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: s_lshl_b32 s8, s0, 16
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1
-; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v5
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v0, v5
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v7
-; GFX9-NEXT: v_mov_b32_e32 v3, v8
-; GFX9-NEXT: v_mov_b32_e32 v4, v9
-; GFX9-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf unorm a16 tfe
+; GFX9-NEXT: v_mov_b32_e32 v1, v8
+; GFX9-NEXT: v_mov_b32_e32 v2, v9
+; GFX9-NEXT: v_mov_b32_e32 v3, v10
+; GFX9-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm a16 tfe
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v5, v4, s[10:11]
+; GFX9-NEXT: global_store_dword v7, v4, s[10:11]
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v7, 0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v2
 ; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_lshl_b32 s8, s0, 16
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v8, v5
-; GFX10-NEXT: v_mov_b32_e32 v9, v5
-; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1
-; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8
 ; GFX10-NEXT: s_mov_b32 s1, s3
+; GFX10-NEXT: v_mov_b32_e32 v8, v7
+; GFX10-NEXT: v_mov_b32_e32 v9, v7
+; GFX10-NEXT: v_mov_b32_e32 v10, v7
+; GFX10-NEXT: v_mov_b32_e32 v11, v7
+; GFX10-NEXT: v_lshl_or_b32 v5, v1, 16, v0
+; GFX10-NEXT: s_mov_b32 s2, s4
 ; GFX10-NEXT: s_mov_b32 s3, s5
+; GFX10-NEXT: s_mov_b32 s4, s6
 ; GFX10-NEXT: s_mov_b32 s5, s7
+; GFX10-NEXT: s_mov_b32 s6, s8
 ; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
-; GFX10-NEXT: v_mov_b32_e32 v1, v6
-; GFX10-NEXT: v_mov_b32_e32 v2, v7
-; GFX10-NEXT: v_mov_b32_e32 v3, v8
-; GFX10-NEXT: v_mov_b32_e32 v4, v9
-; GFX10-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: v_mov_b32_e32 v4, v11
+; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v5, v4, s[10:11]
+; GFX10-NEXT: global_store_dword v7, v4, s[10:11]
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: ; return to shader part epilog
 ;
 ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe:
 ; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT: s_mov_b32 s0, s2
-; GFX11-NEXT: s_mov_b32 s2, s4
-; GFX11-NEXT: s_mov_b32 s4, s6
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_mov_b32 s6, s8
-; GFX11-NEXT: s_lshl_b32 s8, s0, 16
-; GFX11-NEXT: v_mov_b32_e32 v7, v5
-; GFX11-NEXT: v_mov_b32_e32 v9, v5
-; GFX11-NEXT: v_mov_b32_e32 v8, v5
-; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1
-; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, s8
 ; GFX11-NEXT: s_mov_b32 s1, s3
+; GFX11-NEXT: s_mov_b32 s2, s4
+; GFX11-NEXT: v_mov_b32_e32 v9, v7
+; GFX11-NEXT: v_mov_b32_e32 v11, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0
 ; GFX11-NEXT: s_mov_b32 s3, s5
+; GFX11-NEXT: s_mov_b32 s4, s6
 ; GFX11-NEXT: s_mov_b32 s5, s7
+; GFX11-NEXT: s_mov_b32 s6, s8
 ; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: v_mov_b32_e32 v0, v5
-; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v1, v6
-; GFX11-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v4, v9
-; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v2, v9
+; GFX11-NEXT: v_mov_b32_e32 v4, v11
+; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v5, v4, s[10:11]
+; GFX11-NEXT: global_store_b32 v7, v4, s[10:11]
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: ; return to shader part epilog
 %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0)
@@ -143,90 +133,85 @@
 define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i16 %s, i16 %t, i16 %r) {
 ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
 ; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, v2
+; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-NEXT: v_mov_b32_e32 v11, v7
+; GFX9-NEXT: v_mov_b32_e32 v0, v7
 ; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s2, s4
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s6, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: s_lshl_b32 s8, s0, 16
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1
-; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v5
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v0, v5
 ; GFX9-NEXT: s_mov_b32 s1, s3
+; GFX9-NEXT: s_mov_b32 s2, s4
 ; GFX9-NEXT: s_mov_b32 s3, s5
+; GFX9-NEXT: s_mov_b32 s4, s6
 ; GFX9-NEXT: s_mov_b32 s5, s7
+; GFX9-NEXT: s_mov_b32 s6, s8
 ; GFX9-NEXT: s_mov_b32 s7, s9
-; GFX9-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v7
-; GFX9-NEXT: v_mov_b32_e32 v3, v8
-; GFX9-NEXT: v_mov_b32_e32 v4, v9
-; GFX9-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf unorm a16 tfe lwe
+; GFX9-NEXT: v_mov_b32_e32 v1, v8
+; GFX9-NEXT: v_mov_b32_e32 v2, v9
+; GFX9-NEXT: v_mov_b32_e32 v3, v10
+; GFX9-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm a16 tfe lwe
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v5, v4, s[10:11]
+; GFX9-NEXT: global_store_dword v7, v4, s[10:11]
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
 ; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v5, 0
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_mov_b32_e32 v7, 0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v2
 ; GFX10-NEXT: s_mov_b32 s0, s2
-; GFX10-NEXT: s_mov_b32 s2, s4
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s6, s8
-; GFX10-NEXT: s_lshl_b32 s8, s0, 16
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v8, v5
-; GFX10-NEXT: v_mov_b32_e32 v9, v5
-; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1
-; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8
 ; GFX10-NEXT: s_mov_b32
s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v8, v7 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v10, v7 +; GFX10-NEXT: v_mov_b32_e32 v11, v7 +; GFX10-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v0, v5 -; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: v_mov_b32_e32 v2, v7 -; GFX10-NEXT: v_mov_b32_e32 v3, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, v9 -; GFX10-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: v_mov_b32_e32 v4, v11 +; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v5, v4, s[10:11] +; GFX10-NEXT: global_store_dword v7, v4, s[10:11] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: s_mov_b32 s2, s4 -; GFX11-NEXT: s_mov_b32 s4, s6 -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: s_mov_b32 s6, s8 -; GFX11-NEXT: s_lshl_b32 s8, s0, 16 -; GFX11-NEXT: v_mov_b32_e32 v7, v5 -; GFX11-NEXT: v_mov_b32_e32 v9, v5 -; GFX11-NEXT: v_mov_b32_e32 v8, v5 -; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 -; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: v_mov_b32_e32 v9, v7 +; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v1, v6 -; GFX11-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v4, v9 -; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v2, v9 +; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v5, v4, s[10:11] +; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.cd.g16.ll @@ -4,9 +4,6 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; 
GFX10-LABEL: sample_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -18,10 +15,10 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -33,9 +30,6 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_c_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -47,10 +41,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -62,9 +56,6 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -76,10 +67,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; 
GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -91,9 +82,6 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_c_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -106,12 +94,11 @@ ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v1 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -5,9 +5,6 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -19,10 +16,10 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -34,15 +31,12 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r) { ; GFX10-LABEL: sample_d_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v9, s12 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v4 -; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v5, s12 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -54,9 +48,6 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_c_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -68,10 +59,10 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -83,9 +74,6 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -97,10 +85,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -112,9 +100,6 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: 
sample_c_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -127,12 +112,11 @@ ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v1 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -144,15 +128,14 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v10, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v4 +; GFX10-NEXT: v_mov_b32_e32 v9, v2 +; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 -; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -164,15 +147,14 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v3 -; GFX10-NEXT: v_mov_b32_e32 v10, v2 -; GFX10-NEXT: v_mov_b32_e32 v11, v4 +; GFX10-NEXT: v_mov_b32_e32 v9, v2 +; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 -; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -64,17 +64,17 @@ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v7, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v8, v11 -; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v10, v9 -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshl_or_b32 v7, v5, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v8, v10, 16, v9 +; GFX11-NEXT: v_lshl_or_b32 v9, v6, 16, v11 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v[2:4], v[7:9]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) @@ -131,17 +131,17 @@ ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v9 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v8, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v8, 0xffff, v9, v12 -; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v11, v10 -; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshl_or_b32 v8, v6, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v9, v11, 16, v10 +; GFX11-NEXT: v_lshl_or_b32 v10, v7, 16, v12 +; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[0:1], v2, v[3:5], v[8:10]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) @@ -345,18 +345,18 @@ ; GFX11-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v7 ; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3 -; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_lshlrev_b32 v2, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_and_b32 v2, 0xffff, v7 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v8 ; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: v_and_or_b32 v4, 0xffff, v7, v2 -; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v1, v0 +; GFX11-NEXT: v_lshl_or_b32 v4, v5, 16, v2 +; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v8, v3 +; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v3 ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10 @@ -590,18 +590,18 @@ ; GFX11-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v14, v0 :: v_dual_mov_b32 v15, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v8 ; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v17, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_lshlrev_b32 v3, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_and_b32 v3, 0xffff, v9 ; GFX11-NEXT: v_mov_b32_e32 v19, v5 -; GFX11-NEXT: v_and_or_b32 v4, 0xffff, v8, v2 +; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v1, v0 -; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v9, v3 +; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v3 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector-trunc.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector-trunc.mir +++ /dev/null @@ -1,91 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s - ---- -name: build_vector_trunc_v2s16_s32_ss -legalized: true - -body: | - bb.0: - liveins: $sgpr0, $sgpr1 - ; CHECK-LABEL: name: build_vector_trunc_v2s16_s32_ss - ; CHECK: liveins: $sgpr0, $sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - %0:_(s32) = COPY $sgpr0 - %1:_(s32) = COPY $sgpr1 - %2:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 -... 
- ---- -name: build_vector_trunc_v2s16_s32_sv -legalized: true - -body: | - bb.0: - liveins: $sgpr0, $vgpr0 - - ; CHECK-LABEL: name: build_vector_trunc_v2s16_s32_sv - ; CHECK: liveins: $sgpr0, $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:vgpr(s32) = G_SHL [[COPY1]], [[C1]](s32) - ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[AND]], [[SHL]] - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) - %0:_(s32) = COPY $sgpr0 - %1:_(s32) = COPY $vgpr0 - %2:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 -... - ---- -name: build_vector_trunc_v2s16_s32_vs -legalized: true - -body: | - bb.0: - liveins: $vgpr0, $sgpr0 - ; CHECK-LABEL: name: build_vector_trunc_v2s16_s32_vs - ; CHECK: liveins: $vgpr0, $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY1]], [[C1]](s32) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[AND]], [[SHL]] - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) - %0:_(s32) = COPY $vgpr0 - %1:_(s32) = COPY $sgpr0 - %2:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 -... - ---- -name: build_vector_trunc_v2s16_s32_vv -legalized: true - -body: | - bb.0: - liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: build_vector_trunc_v2s16_s32_vv - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:vgpr(s32) = G_SHL [[COPY1]], [[C1]](s32) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[AND]], [[SHL]] - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) - %0:_(s32) = COPY $vgpr0 - %1:_(s32) = COPY $vgpr1 - %2:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 -... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector-trunc.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector-trunc.v2s16.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector-trunc.v2s16.mir +++ /dev/null @@ -1,94 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s - ---- -name: build_vector_trunc_v2s16_s32_ss -legalized: true - -body: | - bb.0: - liveins: $sgpr0, $sgpr1 - - ; CHECK-LABEL: name: build_vector_trunc_v2s16_s32_ss - ; CHECK: liveins: $sgpr0, $sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY]](s32), [[COPY1]](s32) - %0:_(s32) = COPY $sgpr0 - %1:_(s32) = COPY $sgpr1 - %2:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 -... - ---- -name: build_vector_trunc_v2s16_s32_sv -legalized: true - -body: | - bb.0: - liveins: $sgpr0, $vgpr0 - - ; CHECK-LABEL: name: build_vector_trunc_v2s16_s32_sv - ; CHECK: liveins: $sgpr0, $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:vgpr(s32) = G_SHL [[COPY1]], [[C1]](s32) - ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[AND]], [[SHL]] - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) - %0:_(s32) = COPY $sgpr0 - %1:_(s32) = COPY $vgpr0 - %2:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 -... - ---- -name: build_vector_trunc_v2s16_s32_vs -legalized: true - -body: | - bb.0: - liveins: $vgpr0, $sgpr0 - - ; CHECK-LABEL: name: build_vector_trunc_v2s16_s32_vs - ; CHECK: liveins: $vgpr0, $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY1]], [[C1]](s32) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[AND]], [[SHL]] - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) - %0:_(s32) = COPY $vgpr0 - %1:_(s32) = COPY $sgpr0 - %2:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 -... 
- ---- -name: build_vector_trunc_v2s16_s32_vv -legalized: true - -body: | - bb.0: - liveins: $vgpr0, $vgpr1 - - ; CHECK-LABEL: name: build_vector_trunc_v2s16_s32_vv - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:vgpr(s32) = G_SHL [[COPY1]], [[C1]](s32) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[OR:%[0-9]+]]:vgpr(s32) = G_OR [[AND]], [[SHL]] - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(<2 x s16>) = G_BITCAST [[OR]](s32) - %0:_(s32) = COPY $vgpr0 - %1:_(s32) = COPY $vgpr1 - %2:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 -... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -252,9 +252,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rndne_f16_e32 v1, v0 -; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_roundeven_v2f16: @@ -262,8 +261,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_rndne_f16_e32 v1, v0 -; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_roundeven_v2f16: @@ -273,8 +273,8 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_rndne_f16_e32 v0, v0 ; GFX11-NEXT: v_rndne_f16_e32 v1, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x) ret <2 x half> %roundeven @@ -329,9 +329,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX9-NEXT: v_rndne_f16_e32 v1, v0 -; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_roundeven_v2f16_fneg: @@ -340,8 +339,9 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX10-NEXT: v_rndne_f16_e32 v1, v0 -; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_roundeven_v2f16_fneg: @@ -352,8 +352,8 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_rndne_f16_e32 v0, v0 ; GFX11-NEXT: v_rndne_f16_e32 v1, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x.fneg) @@ -413,12 +413,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_rndne_f16_e32 v2, v0 -; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_rndne_f16_e32 v3, v1 -; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX9-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_roundeven_v4f16: @@ -426,11 +425,13 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_rndne_f16_e32 v2, v0 -; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_rndne_f16_e32 v3, v1 -; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 +; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_roundeven_v4f16: @@ -443,10 +444,10 @@ ; GFX11-NEXT: v_rndne_f16_e32 v1, v1 ; GFX11-NEXT: v_rndne_f16_e32 v2, v2 ; GFX11-NEXT: v_rndne_f16_e32 v3, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x) ret <4 x half> %roundeven diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -298,12 +298,12 @@ ; GFX9-LABEL: v_saddsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, 
v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -317,12 +317,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -337,10 +338,10 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -632,72 +633,71 @@ ; GFX9-LABEL: v_saddsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 +; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v1, v6, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_add_i16 v2, v2, v3 clamp ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp -; GFX9-NEXT: v_pk_add_i16 v1, v2, v3 clamp -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_saddsat_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff, v0 +; GFX10-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v7 +; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v8 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_add_i16 v2, v2, v3 clamp ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 
clamp -; GFX10-NEXT: v_pk_add_i16 v1, v2, v3 clamp -; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_pk_ashrrev_i16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_saddsat_v4i8: @@ -705,35 +705,35 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff, v4, v5 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff, v7, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v2, v5, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v5 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_i16 v0, v0, v1 clamp -; GFX11-NEXT: v_pk_add_i16 v1, v2, v3 clamp +; GFX11-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_i16 v1, v3, v1 clamp ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 -; GFX11-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] 
%lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -896,19 +896,19 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 ; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 -; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 @@ -940,21 +940,21 @@ ; GFX10-NEXT: s_lshr_b32 s2, s0, 8 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 @@ -981,29 +981,29 @@ ; GFX11-LABEL: s_saddsat_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s2, s0, 8 -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s1, s4 -; GFX11-NEXT: s_lshr_b32 s6, s2, 16 -; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_lshr_b32 s3, s0, 24 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshr_b32 s5, s1, 24 -; GFX11-NEXT: s_lshl_b32 s4, s4, 0x80008 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 ; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4 +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 ; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s6 -; GFX11-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; 
GFX11-NEXT: v_pk_add_i16 v0, s2, s4 clamp +; GFX11-NEXT: v_pk_add_i16 v0, s2, s3 clamp ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX11-NEXT: s_lshl_b32 s2, s5, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_i16 v1, s0, s1 clamp diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -2904,19 +2904,20 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: s_xor_b32 s4, s11, s7 +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 -; GFX9-NEXT: v_sub_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_sub_u32_sdwa v3, v3, s11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 -; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v3 +; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: global_store_dword v2, v1, s[2:3] @@ -2937,7 +2938,6 @@ ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s6, 0, s2 -; GFX10-NEXT: s_sub_i32 s7, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2945,14 +2945,15 @@ ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX10-NEXT: s_sext_i32_i16 s6, s0 ; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 ; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 +; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 @@ -2961,43 +2962,45 @@ ; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: 
v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: s_xor_b32 s1, s9, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s10, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s0 ; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 +; GFX10-NEXT: s_xor_b32 s0, s10, s8 +; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 -; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 -; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v3, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s10, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -298,12 +298,12 @@ ; GFX9-LABEL: v_ssubsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v2 -; GFX9-NEXT: 
v_and_or_b32 v1, v1, v4, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -317,12 +317,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -337,10 +338,10 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -632,72 +633,71 @@ ; GFX9-LABEL: v_ssubsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 +; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v1, 
v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 clamp ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp -; GFX9-NEXT: v_pk_sub_i16 v1, v2, v3 clamp -; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ssubsat_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff, v0 +; GFX10-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v7 +; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v8 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_sub_i16 v2, v2, v3 clamp ; GFX10-NEXT: 
v_pk_sub_i16 v0, v0, v1 clamp -; GFX10-NEXT: v_pk_sub_i16 v1, v2, v3 clamp -; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_pk_ashrrev_i16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_v4i8: @@ -705,35 +705,35 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff, v4, v5 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff, v7, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v2, v5, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v5 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 clamp -; GFX11-NEXT: v_pk_sub_i16 v1, v2, v3 clamp +; GFX11-NEXT: v_pk_sub_i16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_sub_i16 v1, v3, v1 clamp ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 -; GFX11-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX11-NEXT: 
s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -896,19 +896,19 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 ; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 -; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 @@ -940,21 +940,21 @@ ; GFX10-NEXT: s_lshr_b32 s2, s0, 8 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 @@ -981,29 +981,29 @@ ; GFX11-LABEL: s_ssubsat_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s2, s0, 8 -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s1, s4 -; GFX11-NEXT: s_lshr_b32 s6, s2, 16 -; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_lshr_b32 s3, s0, 24 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshr_b32 s5, s1, 24 -; GFX11-NEXT: s_lshl_b32 s4, s4, 0x80008 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 ; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4 +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 ; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s6 -; GFX11-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 
s5, s1, 16 -; GFX11-NEXT: v_pk_sub_i16 v0, s2, s4 clamp +; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3 clamp ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX11-NEXT: s_lshl_b32 s2, s5, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_sub_i16 v1, s0, s1 clamp diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -223,12 +223,12 @@ ; GFX9-LABEL: v_uaddsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -242,12 +242,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -262,10 +263,10 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -472,72 +473,71 @@ ; GFX9-LABEL: v_uaddsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; 
GFX9-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 +; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 clamp ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp -; GFX9-NEXT: v_pk_add_u16 v1, v2, v3 clamp -; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uaddsat_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 -; GFX10-NEXT: 
v_and_or_b32 v2, 0xffff, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff, v0 +; GFX10-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v7 +; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v8 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_add_u16 v2, v2, v3 clamp ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp -; GFX10-NEXT: v_pk_add_u16 v1, v2, v3 clamp -; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_uaddsat_v4i8: @@ -545,35 +545,35 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff, v4, v5 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff, v7, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v2, v5, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX11-NEXT: v_lshl_or_b32 
v3, v4, 16, v3 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v5 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp -; GFX11-NEXT: v_pk_add_u16 v1, v2, v3 clamp +; GFX11-NEXT: v_pk_add_u16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_u16 v1, v3, v1 clamp ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 -; GFX11-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -667,19 +667,19 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 ; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 -; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 ; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 @@ -711,21 +711,21 @@ ; GFX10-NEXT: s_lshr_b32 s2, s0, 8 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 @@ -752,29 +752,29 @@ ; 
GFX11-LABEL: s_uaddsat_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s2, s0, 8 -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s1, s4 -; GFX11-NEXT: s_lshr_b32 s6, s2, 16 -; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_lshr_b32 s3, s0, 24 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshr_b32 s5, s1, 24 -; GFX11-NEXT: s_lshl_b32 s4, s4, 0x80008 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 ; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4 +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 ; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s6 -; GFX11-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: v_pk_add_u16 v0, s2, s4 clamp +; GFX11-NEXT: v_pk_add_u16 v0, s2, s3 clamp ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX11-NEXT: s_lshl_b32 s2, s5, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_u16 v1, s0, s1 clamp diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -2319,13 +2319,12 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: global_store_dword v2, v1, s[6:7] @@ -2335,8 +2334,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_and_b32 s2, s1, 0xffff +; GFX10-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 @@ -2350,8 +2349,8 @@ ; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 ; GFX10-NEXT: s_sub_i32 s3, 0, s1 ; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s3, s0, 0xffff +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 @@ -2367,26 +2366,26 @@ ; GFX10-NEXT: 
v_subrev_nc_u32_e32 v5, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v2 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -217,12 +217,12 @@ ; GFX9-LABEL: v_usubsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -236,12 +236,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: 
v_and_or_b32 v0, 0xffff, v0, v2 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -256,10 +257,10 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -460,72 +461,71 @@ ; GFX9-LABEL: v_usubsat_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 8 -; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 +; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_sub_u16 v2, v2, v3 clamp ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp -; GFX9-NEXT: v_pk_sub_u16 v1, v2, v3 clamp -; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v2, 8 -; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0xff -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_usubsat_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 -; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff, v0 +; GFX10-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v7 +; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v8 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_sub_u16 v2, v2, v3 clamp ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp -; GFX10-NEXT: v_pk_sub_u16 v1, v2, v3 clamp -; GFX10-NEXT: v_mov_b32_e32 v2, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v1, v2, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-NEXT: 
v_or3_b32 v0, v1, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_usubsat_v4i8: @@ -533,35 +533,35 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 -; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 -; GFX11-NEXT: v_and_or_b32 v2, 0xffff, v4, v5 -; GFX11-NEXT: v_and_or_b32 v3, 0xffff, v7, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v2, v5, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v5 ; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp -; GFX11-NEXT: v_pk_sub_u16 v1, v2, v3 clamp +; GFX11-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_sub_u16 v1, v3, v1 clamp ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 -; GFX11-NEXT: v_or3_b32 v0, v0, v3, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> @@ -651,19 +651,19 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 ; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 -; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 ; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 
; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 @@ -695,21 +695,21 @@ ; GFX10-NEXT: s_lshr_b32 s2, s0, 8 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 @@ -736,29 +736,29 @@ ; GFX11-LABEL: s_usubsat_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s2, s0, 8 -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s1, s4 -; GFX11-NEXT: s_lshr_b32 s6, s2, 16 -; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_lshr_b32 s3, s0, 24 -; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX11-NEXT: s_lshr_b32 s6, s4, 16 +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshr_b32 s5, s1, 24 -; GFX11-NEXT: s_lshl_b32 s4, s4, 0x80008 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 ; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4 +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 ; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5 -; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s6 -; GFX11-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: v_pk_sub_u16 v0, s2, s4 clamp +; GFX11-NEXT: v_pk_sub_u16 v0, s2, s3 clamp ; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 ; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX11-NEXT: s_lshl_b32 s2, s5, 8 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_sub_u16 v1, s0, s1 clamp diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll @@ -41,6 +41,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: zextload_global_i1_to_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load i1, i1 addrspace(1)* %ptr %ext = zext i1 %load to i32 ret i32 %ext @@ -78,6 +87,14 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: zextload_global_i8_to_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load i8, i8 addrspace(1)* %ptr %ext = zext i8 %load to i32 ret i32 %ext @@ -115,6 +132,14 @@ ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: zextload_global_i16_to_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load i16, i16 addrspace(1)* %ptr %ext = zext i16 %load to i32 ret i32 %ext diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -13,8 +13,6 @@ ; ; GFX10GISEL-LABEL: sample_d_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -34,8 +32,8 @@ ; ; GFX10GISEL-LABEL: sample_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5 +; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -62,10 +60,10 @@ ; ; GFX10GISEL-LABEL: sample_d_3d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7 -; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12 +; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7 +; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8 +; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -83,8 +81,6 @@ ; ; GFX10GISEL-LABEL: sample_c_d_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v3, s12 ; GFX10GISEL-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -104,8 +100,8 @@ ; ; GFX10GISEL-LABEL: sample_c_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10GISEL-NEXT: 
+; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10GISEL-NEXT: image_sample_c_d v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -125,8 +121,8 @@
 ;
 ; GFX10GISEL-LABEL: sample_d_cl_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
+; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -151,10 +147,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_d_cl_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5
-; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v6, s12
+; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX10GISEL-NEXT: v_mov_b32_e32 v5, v6
+; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v7, 16, v4
 ; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -174,8 +170,8 @@
 ;
 ; GFX10GISEL-LABEL: sample_c_d_cl_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v3, v4
+; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -201,10 +197,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_c_d_cl_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6
-; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v7, s12
+; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX10GISEL-NEXT: v_mov_b32_e32 v6, v7
+; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v8, 16, v5
 ; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -222,8 +218,6 @@
 ;
 ; GFX10GISEL-LABEL: sample_cd_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10GISEL-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -243,8 +237,8 @@
 ;
 ; GFX10GISEL-LABEL: sample_cd_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5
+; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v5, 16, v4
 ; GFX10GISEL-NEXT: image_sample_cd v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -262,8 +256,6 @@
 ;
 ; GFX10GISEL-LABEL: sample_c_cd_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v3, s12
 ; GFX10GISEL-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -283,8 +275,8 @@
 ;
 ; GFX10GISEL-LABEL: sample_c_cd_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6
+; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v6, 16, v5
 ; GFX10GISEL-NEXT: image_sample_c_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -304,8 +296,8 @@
 ;
 ; GFX10GISEL-LABEL: sample_cd_cl_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
+; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2
 ; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -330,10 +322,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_cd_cl_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5
-; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v6, s12
+; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX10GISEL-NEXT: v_mov_b32_e32 v5, v6
+; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v7, 16, v4
 ; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -353,8 +345,8 @@
 ;
 ; GFX10GISEL-LABEL: sample_c_cd_cl_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v3, v4
+; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3
 ; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -380,10 +372,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_c_cd_cl_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6
-; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v7, s12
+; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX10GISEL-NEXT: v_mov_b32_e32 v6, v7
+; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v8, 16, v5
 ; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -410,10 +402,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V1:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7
-; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7
+; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8
+; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6
 ; GFX10GISEL-NEXT: image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -440,10 +432,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7
-; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7
+; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8
+; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6
 ; GFX10GISEL-NEXT: image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -483,9 +475,6 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -507,10 +496,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
+; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -534,15 +523,12 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_3d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2
-; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v9, s12
-; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v0, v1
-; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v4
-; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, s12
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
+; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX10GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v1, 16, v0
+; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v9
 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -560,9 +546,6 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
-; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -584,10 +567,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
-; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v3, v4
+; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
 ; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -605,9 +588,6 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -629,10 +609,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
+; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
 ; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -650,9 +630,6 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
-; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -677,12 +654,11 @@
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
 ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2
-; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v8
-; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v9, v4
-; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v1, v0
+; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v8, 16, v0
+; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v1
 ; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -700,9 +676,6 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_cd_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -724,10 +697,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_cd_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
+; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
 ; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -745,9 +718,6 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
-; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -769,10 +739,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
-; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v3, v4
+; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
 ; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -790,9 +760,6 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
 ; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -814,10 +781,10 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
+; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
 ; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -835,9 +802,6 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_1d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
-; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
-; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
 ; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -862,12 +826,11 @@
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_2d:
 ; GFX10GISEL: ; %bb.0: ; %main_body
 ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2
-; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v8
-; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v9, v4
-; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v1, v0
+; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3
+; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v8, 16, v0
+; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v1
 ; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -893,15 +856,14 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_o_2darray_V1:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
-; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v2
-; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2
+; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3
 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v9
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v0
-; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v11, v1
+; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v9
+; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v5, 16, v1
+; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v10, 16, v0
 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
@@ -927,15 +889,14 @@
 ;
 ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_o_2darray_V2:
 ; GFX10GISEL: ; %bb.0: ; %main_body
-; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
-; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v2
-; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4
+; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2
+; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3
 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v9
-; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v0
-; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v11, v1
+; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v9
+; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v5, 16, v1
+; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v10, 16, v0
 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX10GISEL-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -31,11 +31,10 @@
 ; GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_waitcnt_depctr 0xffe3
-; GISEL-NEXT: s_movk_i32 s0, 0x4000
 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
-; GISEL-NEXT: v_add_f16_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GISEL-NEXT: ;;#ASMSTART
 ; GISEL-NEXT: ; use v0
 ; GISEL-NEXT: ;;#ASMEND
@@ -82,11 +81,10 @@
 ; GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_waitcnt_depctr 0xffe3
-; GISEL-NEXT: s_movk_i32 s0, 0x4000
 ; GISEL-NEXT: v_subrev_f16_e32 v0, 2.0, v1
-; GISEL-NEXT: v_add_f16_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GISEL-NEXT: ;;#ASMSTART
 ; GISEL-NEXT: ; use v0
 ; GISEL-NEXT: ;;#ASMEND
@@ -134,8 +132,8 @@
 ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GISEL-NEXT: v_cvt_f16_f32_sdwa v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
 ; GISEL-NEXT: v_mov_b32_e32 v1, 0
 ; GISEL-NEXT: global_store_dword v1, v0, s[0:1]
 ; GISEL-NEXT: s_endpgm
@@ -174,12 +172,12 @@
 ; GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v1
-; GISEL-NEXT: v_add_f16_e32 v2, 2.0, v2
+; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
 ; GISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1
-; GISEL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GISEL-NEXT: ;;#ASMSTART
 ; GISEL-NEXT: ; use v0
 ; GISEL-NEXT: ;;#ASMEND
@@ -228,13 +226,12 @@
 ; GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_waitcnt_depctr 0xffe3
-; GISEL-NEXT: s_mov_b32 s0, 0x8000
 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
 ; GISEL-NEXT: v_sub_f16_e32 v0, 0x8000, v0
-; GISEL-NEXT: v_sub_f16_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: v_sub_f16_e32 v1, 0x8000, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
 ; GISEL-NEXT: ;;#ASMSTART
 ; GISEL-NEXT: ; use v0
 ; GISEL-NEXT: ;;#ASMEND