Index: llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -34,12 +34,24 @@ PM_SGPR96 = 23, PM_VGPR96 = 24, PM_AGPR96 = 25, - PM_AGPR32 = 31, - PM_AGPR64 = 32, - PM_AGPR128 = 33, - PM_AGPR256 = 34, - PM_AGPR512 = 35, - PM_AGPR1024 = 36 + PM_SGPR288 = 26, + PM_VGPR288 = 27, + PM_AGPR288 = 28, + PM_SGPR320 = 29, + PM_VGPR320 = 30, + PM_AGPR320 = 31, + PM_SGPR352 = 32, + PM_VGPR352 = 33, + PM_AGPR352 = 34, + PM_SGPR384 = 35, + PM_VGPR384 = 36, + PM_AGPR384 = 37, + PM_AGPR32 = 38, + PM_AGPR64 = 39, + PM_AGPR128 = 40, + PM_AGPR256 = 41, + PM_AGPR512 = 42, + PM_AGPR1024 = 43 }; const RegisterBankInfo::PartialMapping PartMappings[] { @@ -66,6 +78,18 @@ {0, 96, SGPRRegBank}, {0, 96, VGPRRegBank}, {0, 96, AGPRRegBank}, + {0, 288, SGPRRegBank}, + {0, 288, VGPRRegBank}, + {0, 288, AGPRRegBank}, + {0, 320, SGPRRegBank}, + {0, 320, VGPRRegBank}, + {0, 320, AGPRRegBank}, + {0, 352, SGPRRegBank}, + {0, 352, VGPRRegBank}, + {0, 352, AGPRRegBank}, + {0, 384, SGPRRegBank}, + {0, 384, VGPRRegBank}, + {0, 384, AGPRRegBank}, {0, 32, AGPRRegBank}, // AGPR begin {0, 64, AGPRRegBank}, @@ -107,6 +131,18 @@ {&PartMappings[17], 1}, {&PartMappings[18], 1}, {&PartMappings[19], 1}, + {&PartMappings[20], 1}, + {&PartMappings[21], 1}, + {&PartMappings[22], 1}, + {&PartMappings[23], 1}, + {&PartMappings[24], 1}, + {&PartMappings[25], 1}, + {&PartMappings[26], 1}, + {&PartMappings[27], 1}, + {&PartMappings[28], 1}, + {&PartMappings[29], 1}, + {&PartMappings[30], 1}, + {&PartMappings[31], 1}, // AGPRs {nullptr, 0}, @@ -114,12 +150,12 @@ {nullptr, 0}, {nullptr, 0}, {nullptr, 0}, - {&PartMappings[20], 1}, // 32 - {&PartMappings[21], 1}, // 64 - {&PartMappings[22], 1}, // 128 - {&PartMappings[23], 1}, // 256 - {&PartMappings[24], 1}, // 512 - {&PartMappings[25], 1} // 1024 + {&PartMappings[32], 1}, // 32 + {&PartMappings[33], 1}, // 64 + {&PartMappings[34], 1}, // 128 + {&PartMappings[35], 1}, // 256 + {&PartMappings[36], 1}, // 512 + {&PartMappings[37], 1} // 1024 }; const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] { @@ -148,7 +184,7 @@ enum ValueMappingIdx { SGPRStartIdx = 1, VGPRStartIdx = 12, - AGPRStartIdx = 26 + AGPRStartIdx = 38 }; const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, @@ -175,6 +211,62 @@ default: llvm_unreachable("Invalid register bank"); } break; + case 288: + switch (BankID) { + case AMDGPU::VGPRRegBankID: + Idx = PM_VGPR288; + break; + case AMDGPU::SGPRRegBankID: + Idx = PM_SGPR288; + break; + case AMDGPU::AGPRRegBankID: + Idx = PM_AGPR288; + break; + default: llvm_unreachable("Invalid register bank"); + } + break; + case 320: + switch (BankID) { + case AMDGPU::VGPRRegBankID: + Idx = PM_VGPR320; + break; + case AMDGPU::SGPRRegBankID: + Idx = PM_SGPR320; + break; + case AMDGPU::AGPRRegBankID: + Idx = PM_AGPR320; + break; + default: llvm_unreachable("Invalid register bank"); + } + break; + case 352: + switch (BankID) { + case AMDGPU::VGPRRegBankID: + Idx = PM_VGPR352; + break; + case AMDGPU::SGPRRegBankID: + Idx = PM_SGPR352; + break; + case AMDGPU::AGPRRegBankID: + Idx = PM_AGPR352; + break; + default: llvm_unreachable("Invalid register bank"); + } + break; + case 384: + switch (BankID) { + case AMDGPU::VGPRRegBankID: + Idx = PM_VGPR384; + break; + case AMDGPU::SGPRRegBankID: + Idx = PM_SGPR384; + break; + case AMDGPU::AGPRRegBankID: + Idx = PM_AGPR384; + break; + default: 
llvm_unreachable("Invalid register bank"); + } + break; default: switch (BankID) { case AMDGPU::VGPRRegBankID: Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -84,6 +84,18 @@ setOperationAction(ISD::LOAD, MVT::v8f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); + setOperationAction(ISD::LOAD, MVT::v9f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32); + + setOperationAction(ISD::LOAD, MVT::v10f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32); + + setOperationAction(ISD::LOAD, MVT::v11f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32); + + setOperationAction(ISD::LOAD, MVT::v12f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32); + setOperationAction(ISD::LOAD, MVT::v16f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); @@ -196,6 +208,18 @@ setOperationAction(ISD::STORE, MVT::v8f32, Promote); AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); + setOperationAction(ISD::STORE, MVT::v9f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32); + + setOperationAction(ISD::STORE, MVT::v10f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32); + + setOperationAction(ISD::STORE, MVT::v11f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32); + + setOperationAction(ISD::STORE, MVT::v12f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32); + setOperationAction(ISD::STORE, MVT::v16f32, Promote); AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); @@ -325,19 +349,23 @@ setOperationAction(ISD::FSUB, MVT::f64, Expand); setOperationAction(ISD::CONCAT_VECTORS, - {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32, - MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, - MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32}, + {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32, + MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, + MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32, + MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32, + MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32}, Custom); setOperationAction( ISD::EXTRACT_SUBVECTOR, {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32, - MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v16f16, MVT::v16i16, - MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, MVT::v2f64, - MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, MVT::v4i64, - MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64}, + MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32, MVT::v9i32, + MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, + MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32, + MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64, + MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64, + MVT::v16f64, MVT::v16i64}, Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); @@ -384,7 +412,8 @@ MVT::i64, Custom); static const MVT::SimpleValueType VectorIntTypes[] = { - MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32}; + MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32, + MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32}; for (MVT VT : VectorIntTypes) { // Expand the following operations for the current type by default. 
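A compact way to read the promotions added above: every new wide f32 vector type is promoted to the integer vector with the same element count. A minimal sketch of that pattern, assuming it sits inside the AMDGPUTargetLowering constructor like the surrounding code (the loop form is illustrative only, not the patch's actual wording):

for (MVT VT : {MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32}) {
  // v9f32 -> v9i32, v10f32 -> v10i32, and so on.
  MVT IntVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements());
  for (unsigned Op : {(unsigned)ISD::LOAD, (unsigned)ISD::STORE, (unsigned)ISD::SELECT}) {
    setOperationAction(Op, VT, Promote); // mark the f32 vector op as Promote
    AddPromotedToType(Op, VT, IntVT);    // and promote it to the i32 vector
  }
}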
@@ -404,7 +433,8 @@ } static const MVT::SimpleValueType FloatVectorTypes[] = { - MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32}; + MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, + MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32}; for (MVT VT : FloatVectorTypes) { setOperationAction( @@ -440,6 +470,18 @@ setOperationAction(ISD::SELECT, MVT::v7f32, Promote); AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32); + setOperationAction(ISD::SELECT, MVT::v9f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32); + + setOperationAction(ISD::SELECT, MVT::v10f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32); + + setOperationAction(ISD::SELECT, MVT::v11f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32); + + setOperationAction(ISD::SELECT, MVT::v12f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32); + // There are no libcalls of any kind. for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) setLibcallName(static_cast(I), nullptr); @@ -1064,7 +1106,9 @@ // Round up vec3/vec5 argument. if (MemVT.isVector() && !MemVT.isPow2VectorType()) { assert(MemVT.getVectorNumElements() == 3 || - MemVT.getVectorNumElements() == 5); + MemVT.getVectorNumElements() == 5 || + (MemVT.getVectorNumElements() >= 9 && + MemVT.getVectorNumElements() <= 12)); MemVT = MemVT.getPow2VectorType(State.getContext()); } else if (!MemVT.isSimple() && !MemVT.isVector()) { MemVT = MemVT.getRoundIntegerType(State.getContext()); Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5446,7 +5446,7 @@ Opcode = AMDGPU::getMIMGOpcode( BaseOpcodes[Is64][IsA16], IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, - NumVDataDwords, PowerOf2Ceil(NumVAddrDwords)); + NumVDataDwords, NumVAddrDwords); } assert(Opcode != -1); Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -7,16 +7,16 @@ //===----------------------------------------------------------------------===// def SGPRRegBank : RegisterBank<"SGPR", - [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_512, SReg_1024] + [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_288, SReg_320, SReg_352, SReg_384, SReg_512, SReg_1024] >; def VGPRRegBank : RegisterBank<"VGPR", - [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024] + [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384, VReg_512, VReg_1024] >; // It is helpful to distinguish conditions from ordinary SGPRs. 
def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; def AGPRRegBank : RegisterBank <"AGPR", - [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_512, AReg_1024] + [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_288, AReg_320, AReg_352, AReg_384, AReg_512, AReg_1024] >; Index: llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -431,6 +431,46 @@ IsSGPR = false; IsAGPR = true; Width = 8; + } else if (AMDGPU::VReg_288RegClass.contains(Reg)) { + IsSGPR = false; + Width = 9; + } else if (AMDGPU::SReg_288RegClass.contains(Reg)) { + IsSGPR = true; + Width = 9; + } else if (AMDGPU::AReg_288RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 9; + } else if (AMDGPU::VReg_320RegClass.contains(Reg)) { + IsSGPR = false; + Width = 10; + } else if (AMDGPU::SReg_320RegClass.contains(Reg)) { + IsSGPR = true; + Width = 10; + } else if (AMDGPU::AReg_320RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 10; + } else if (AMDGPU::VReg_352RegClass.contains(Reg)) { + IsSGPR = false; + Width = 11; + } else if (AMDGPU::SReg_352RegClass.contains(Reg)) { + IsSGPR = true; + Width = 11; + } else if (AMDGPU::AReg_352RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 11; + } else if (AMDGPU::VReg_384RegClass.contains(Reg)) { + IsSGPR = false; + Width = 12; + } else if (AMDGPU::SReg_384RegClass.contains(Reg)) { + IsSGPR = true; + Width = 12; + } else if (AMDGPU::AReg_384RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 12; } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_512RegClass.contains(Reg) && "trap handler registers should not be used"); Index: llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -2360,6 +2360,14 @@ return AMDGPU::VReg_224RegClassID; case 256: return AMDGPU::VReg_256RegClassID; + case 288: + return AMDGPU::VReg_288RegClassID; + case 320: + return AMDGPU::VReg_320RegClassID; + case 352: + return AMDGPU::VReg_352RegClassID; + case 384: + return AMDGPU::VReg_384RegClassID; case 512: return AMDGPU::VReg_512RegClassID; case 1024: @@ -2398,6 +2406,14 @@ return AMDGPU::SGPR_224RegClassID; case 256: return AMDGPU::SGPR_256RegClassID; + case 288: + return AMDGPU::SGPR_288RegClassID; + case 320: + return AMDGPU::SGPR_320RegClassID; + case 352: + return AMDGPU::SGPR_352RegClassID; + case 384: + return AMDGPU::SGPR_384RegClassID; case 512: return AMDGPU::SGPR_512RegClassID; } @@ -2420,6 +2436,14 @@ return AMDGPU::AReg_224RegClassID; case 256: return AMDGPU::AReg_256RegClassID; + case 288: + return AMDGPU::AReg_288RegClassID; + case 320: + return AMDGPU::AReg_320RegClassID; + case 352: + return AMDGPU::AReg_352RegClassID; + case 384: + return AMDGPU::AReg_384RegClassID; case 512: return AMDGPU::AReg_512RegClassID; case 1024: @@ -3684,7 +3708,7 @@ AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16()); if (!IsNSA) { - if (ExpectedAddrSize > 8) + if (ExpectedAddrSize > 12) ExpectedAddrSize = 16; // Allow oversized 8 VGPR vaddr when only 5/6/7 VGPRs are required. 
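The AsmParser change above and the matching Disassembler and SIInstrInfo changes below rely on the same arithmetic: an N-dword register tuple is N * 32 bits wide (9 -> 288, 10 -> 320, 11 -> 352, 12 -> 384), and non-NSA MIMG address sizes above 12 dwords still round up to the 16-dword encoding. A small self-contained sketch of that rule (the helper names are hypothetical, chosen for illustration only):

#include <cassert>

// N dwords -> N * 32 bits: the width used to pick the new register classes.
static unsigned regTupleBits(unsigned NumDwords) {
  return NumDwords * 32; // 9 -> 288, 10 -> 320, 11 -> 352, 12 -> 384
}

// Non-NSA vaddr rounding after this patch: 9-12 dwords keep an exact tuple,
// only sizes above 12 still widen to the 16-dword (512-bit) form.
static unsigned roundNonNSAVAddrDwords(unsigned AddrDwords) {
  return AddrDwords > 12 ? 16 : AddrDwords;
}

int main() {
  assert(regTupleBits(9) == 288 && regTupleBits(12) == 384);
  assert(roundNonNSAVAddrDwords(11) == 11);
  assert(roundNonNSAVAddrDwords(13) == 16);
  return 0;
}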
Index: llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h =================================================================== --- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -182,6 +182,10 @@ MCOperand decodeOperand_VReg_96(unsigned Val) const; MCOperand decodeOperand_VReg_128(unsigned Val) const; MCOperand decodeOperand_VReg_256(unsigned Val) const; + MCOperand decodeOperand_VReg_288(unsigned Val) const; + MCOperand decodeOperand_VReg_320(unsigned Val) const; + MCOperand decodeOperand_VReg_352(unsigned Val) const; + MCOperand decodeOperand_VReg_384(unsigned Val) const; MCOperand decodeOperand_VReg_512(unsigned Val) const; MCOperand decodeOperand_VReg_1024(unsigned Val) const; @@ -193,12 +197,20 @@ MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const; MCOperand decodeOperand_SReg_128(unsigned Val) const; MCOperand decodeOperand_SReg_256(unsigned Val) const; + MCOperand decodeOperand_SReg_288(unsigned Val) const; + MCOperand decodeOperand_SReg_320(unsigned Val) const; + MCOperand decodeOperand_SReg_352(unsigned Val) const; + MCOperand decodeOperand_SReg_384(unsigned Val) const; MCOperand decodeOperand_SReg_512(unsigned Val) const; MCOperand decodeOperand_AGPR_32(unsigned Val) const; MCOperand decodeOperand_AReg_64(unsigned Val) const; MCOperand decodeOperand_AReg_128(unsigned Val) const; MCOperand decodeOperand_AReg_256(unsigned Val) const; + MCOperand decodeOperand_AReg_288(unsigned Val) const; + MCOperand decodeOperand_AReg_320(unsigned Val) const; + MCOperand decodeOperand_AReg_352(unsigned Val) const; + MCOperand decodeOperand_AReg_384(unsigned Val) const; MCOperand decodeOperand_AReg_512(unsigned Val) const; MCOperand decodeOperand_AReg_1024(unsigned Val) const; MCOperand decodeOperand_AV_32(unsigned Val) const; @@ -214,6 +226,10 @@ OPW128, OPW160, OPW256, + OPW288, + OPW320, + OPW352, + OPW384, OPW512, OPW1024, OPW16, Index: llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -129,6 +129,10 @@ DECODE_OPERAND_REG(VReg_96) DECODE_OPERAND_REG(VReg_128) DECODE_OPERAND_REG(VReg_256) +DECODE_OPERAND_REG(VReg_288) +DECODE_OPERAND_REG(VReg_320) +DECODE_OPERAND_REG(VReg_352) +DECODE_OPERAND_REG(VReg_384) DECODE_OPERAND_REG(VReg_512) DECODE_OPERAND_REG(VReg_1024) @@ -932,7 +935,7 @@ IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA || Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA; if (!IsNSA) { - if (AddrSize > 8) + if (AddrSize > 12) AddrSize = 16; } else { if (AddrSize > Info->VAddrDwords) { @@ -1142,6 +1145,14 @@ case AMDGPU::TTMP_256RegClassID: // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in // this bundle?

+ case AMDGPU::SGPR_288RegClassID: + case AMDGPU::TTMP_288RegClassID: + case AMDGPU::SGPR_320RegClassID: + case AMDGPU::TTMP_320RegClassID: + case AMDGPU::SGPR_352RegClassID: + case AMDGPU::TTMP_352RegClassID: + case AMDGPU::SGPR_384RegClassID: + case AMDGPU::TTMP_384RegClassID: case AMDGPU::SGPR_512RegClassID: case AMDGPU::TTMP_512RegClassID: shift = 2; @@ -1217,6 +1228,23 @@ return createRegOperand(AMDGPU::AReg_256RegClassID, Val & 255); } +MCOperand AMDGPUDisassembler::decodeOperand_AReg_288(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_288RegClassID, Val & 255); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AReg_320(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_320RegClassID, Val & 255); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AReg_352(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_352RegClassID, Val & 255); +} + +MCOperand AMDGPUDisassembler::decodeOperand_AReg_384(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_384RegClassID, Val & 255); +} + + MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const { return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255); } @@ -1265,6 +1293,22 @@ return createRegOperand(AMDGPU::VReg_256RegClassID, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VReg_288(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_288RegClassID, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_VReg_320(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_320RegClassID, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_VReg_352(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_352RegClassID, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_VReg_384(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_384RegClassID, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const { return createRegOperand(AMDGPU::VReg_512RegClassID, Val); } @@ -1315,6 +1359,22 @@ return decodeDstOp(OPW256, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_SReg_288(unsigned Val) const { + return decodeDstOp(OPW288, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_SReg_320(unsigned Val) const { + return decodeDstOp(OPW320, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_SReg_352(unsigned Val) const { + return decodeDstOp(OPW352, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_SReg_384(unsigned Val) const { + return decodeDstOp(OPW384, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const { return decodeDstOp(OPW512, Val); } @@ -1473,6 +1533,10 @@ case OPW128: return VReg_128RegClassID; case OPW160: return VReg_160RegClassID; case OPW256: return VReg_256RegClassID; + case OPW288: return VReg_288RegClassID; + case OPW320: return VReg_320RegClassID; + case OPW352: return VReg_352RegClassID; + case OPW384: return VReg_384RegClassID; case OPW512: return VReg_512RegClassID; case OPW1024: return VReg_1024RegClassID; } @@ -1494,6 +1558,10 @@ case OPW128: return AReg_128RegClassID; case OPW160: return AReg_160RegClassID; case OPW256: return AReg_256RegClassID; + case OPW288: return AReg_288RegClassID; + case OPW320: return AReg_320RegClassID; + case OPW352: return AReg_352RegClassID; + case OPW384: return AReg_384RegClassID; case OPW512: return AReg_512RegClassID; case OPW1024: return AReg_1024RegClassID; } @@ -1516,6 +1584,10 @@ case OPW128: return SGPR_128RegClassID; case OPW160: return SGPR_160RegClassID; case OPW256: return SGPR_256RegClassID; + case OPW288: return 
SGPR_288RegClassID; + case OPW320: return SGPR_320RegClassID; + case OPW352: return SGPR_352RegClassID; + case OPW384: return SGPR_384RegClassID; case OPW512: return SGPR_512RegClassID; } } @@ -1534,6 +1606,10 @@ case OPWV232: return TTMP_64RegClassID; case OPW128: return TTMP_128RegClassID; case OPW256: return TTMP_256RegClassID; + case OPW288: return TTMP_288RegClassID; + case OPW320: return TTMP_320RegClassID; + case OPW352: return TTMP_352RegClassID; + case OPW384: return TTMP_384RegClassID; case OPW512: return TTMP_512RegClassID; } } Index: llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp =================================================================== --- llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -503,6 +503,10 @@ MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_288RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_320RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_352RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_384RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg)) Enc |= 512; Index: llvm/lib/Target/AMDGPU/MIMGInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -958,7 +958,11 @@ !if(!eq(NumWords, 6), VReg_192, !if(!eq(NumWords, 7), VReg_224, !if(!le(NumWords, 8), VReg_256, - !if(!le(NumWords, 16), VReg_512, ?)))))))))); + !if(!le(NumWords, 9), VReg_288, + !if(!le(NumWords, 10), VReg_320, + !if(!le(NumWords, 11), VReg_352, + !if(!le(NumWords, 12), VReg_384, + !if(!le(NumWords, 16), VReg_512, ?)))))))))))))); // Whether the instruction variant with this vaddr size should be enabled for // the auto-generated disassembler. 
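With the register-class chain extended above, a non-NSA MIMG address of 9-12 dwords now selects one of the new tuples instead of falling through to VReg_512. A hedged C++ rendering of the same selection order (the register class names are the real ones from the patch; the helper itself is illustrative):

#include <string>

// Mirrors the TableGen !if chain above for NumWords in [1, 16].
static std::string mimgAddrRegClass(unsigned NumWords) {
  if (NumWords == 1) return "VGPR_32";
  if (NumWords <= 8) return "VReg_" + std::to_string(NumWords * 32);
  if (NumWords == 9)  return "VReg_288";
  if (NumWords == 10) return "VReg_320";
  if (NumWords == 11) return "VReg_352";
  if (NumWords == 12) return "VReg_384";
  return "VReg_512"; // 13-16 dwords still use the 512-bit tuple
}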
@@ -1007,8 +1011,8 @@ !foreach(range, // V4 is generated for V3 and V4 // V8 is generated for V5 through V8 - // V16 is generated for V9 through V16 - [[1],[2],[3],[3,4],[5],[6],[7],[5,8],[9,16]], + // V16 is generated for V13 through V16 + [[1],[2],[3],[3,4],[5],[6],[7],[5,8],[9],[10],[11],[12],[13,16]], MIMGAddrSizes_dw_range), lhs, dw, !if(isRangeInList.ret, Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -120,6 +120,18 @@ addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); + addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); + addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288)); + + addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); + addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320)); + + addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); + addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352)); + + addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); + addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384)); + addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); @@ -158,15 +170,17 @@ // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, - {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, - MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1, - MVT::v32i32}, + {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, + MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, + MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32, + MVT::i1, MVT::v32i32}, Custom); setOperationAction(ISD::STORE, - {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, - MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1, - MVT::v32i32}, + {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, + MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, + MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32, + MVT::i1, MVT::v32i32}, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); @@ -209,12 +223,14 @@ AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32); setOperationAction(ISD::TRUNCATE, - {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, - MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32}, + {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, + MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, + MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32}, Expand); setOperationAction(ISD::FP_ROUND, - {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, - MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32}, + {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, + MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32, + MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32}, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, @@ -240,11 +256,13 @@ // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
for (MVT VT : - {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, - MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64, - MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, - MVT::v8f64, MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, - MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) { + {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, + MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, + MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, + MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, + MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16, + MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64, + MVT::v32i32, MVT::v32f32}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -365,8 +383,10 @@ // Deal with vec5/6/7 vector operations when widened to vec8. setOperationAction(ISD::INSERT_SUBVECTOR, - {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, - MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32}, + {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, + MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32, + MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32, + MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32}, Custom); // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, @@ -4235,6 +4255,10 @@ case AMDGPU::SI_INDIRECT_SRC_V2: case AMDGPU::SI_INDIRECT_SRC_V4: case AMDGPU::SI_INDIRECT_SRC_V8: + case AMDGPU::SI_INDIRECT_SRC_V9: + case AMDGPU::SI_INDIRECT_SRC_V10: + case AMDGPU::SI_INDIRECT_SRC_V11: + case AMDGPU::SI_INDIRECT_SRC_V12: case AMDGPU::SI_INDIRECT_SRC_V16: case AMDGPU::SI_INDIRECT_SRC_V32: return emitIndirectSrc(MI, *BB, *getSubtarget()); @@ -4242,6 +4266,10 @@ case AMDGPU::SI_INDIRECT_DST_V2: case AMDGPU::SI_INDIRECT_DST_V4: case AMDGPU::SI_INDIRECT_DST_V8: + case AMDGPU::SI_INDIRECT_DST_V9: + case AMDGPU::SI_INDIRECT_DST_V10: + case AMDGPU::SI_INDIRECT_DST_V11: + case AMDGPU::SI_INDIRECT_DST_V12: case AMDGPU::SI_INDIRECT_DST_V16: case AMDGPU::SI_INDIRECT_DST_V32: return emitIndirectDst(MI, *BB, *getSubtarget()); @@ -6185,7 +6213,7 @@ MVT Type; unsigned NumElts = Elts.size(); - if (NumElts <= 8) { + if (NumElts <= 12) { Type = MVT::getVectorVT(MVT::f32, NumElts); } else { assert(Elts.size() <= 16); @@ -7735,7 +7763,7 @@ AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, - NumVDataDwords, PowerOf2Ceil(NumVAddrDwords)); + NumVDataDwords, NumVAddrDwords); } assert(Opcode != -1); @@ -7801,13 +7829,13 @@ if (!UseNSA) { // Build a single vector containing all the operands so far prepared. - if (NumVAddrDwords > 8) { + if (NumVAddrDwords > 12) { SDValue Undef = DAG.getUNDEF(MVT::i32); Ops.append(16 - Ops.size(), Undef); } - assert(Ops.size() == 8 || Ops.size() == 16); + assert(Ops.size() >= 8 && Ops.size() <= 12); SDValue MergedOps = DAG.getBuildVector( - Ops.size() == 16 ? 
MVT::v16i32 : MVT::v8i32, DL, Ops); + MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops); Ops.clear(); Ops.push_back(MergedOps); } @@ -12466,6 +12494,14 @@ return AMDGPU::VReg_224_Align2RegClassID; case AMDGPU::VReg_256RegClassID: return AMDGPU::VReg_256_Align2RegClassID; + case AMDGPU::VReg_288RegClassID: + return AMDGPU::VReg_288_Align2RegClassID; + case AMDGPU::VReg_320RegClassID: + return AMDGPU::VReg_320_Align2RegClassID; + case AMDGPU::VReg_352RegClassID: + return AMDGPU::VReg_352_Align2RegClassID; + case AMDGPU::VReg_384RegClassID: + return AMDGPU::VReg_384_Align2RegClassID; case AMDGPU::VReg_512RegClassID: return AMDGPU::VReg_512_Align2RegClassID; case AMDGPU::VReg_1024RegClassID: Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1300,6 +1300,14 @@ return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); if (VecSize <= 256) // 32 bytes return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); + if (VecSize <= 288) // 36 bytes + return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9); + if (VecSize <= 320) // 40 bytes + return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10); + if (VecSize <= 352) // 44 bytes + return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11); + if (VecSize <= 384) // 48 bytes + return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12); if (VecSize <= 512) // 64 bytes return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16); if (VecSize <= 1024) // 128 bytes @@ -1320,6 +1328,14 @@ return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); if (VecSize <= 256) // 32 bytes return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); + if (VecSize <= 288) // 36 bytes + return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9); + if (VecSize <= 320) // 40 bytes + return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10); + if (VecSize <= 352) // 44 bytes + return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11); + if (VecSize <= 384) // 48 bytes + return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12); if (VecSize <= 512) // 64 bytes return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16); if (VecSize <= 1024) // 128 bytes @@ -1341,6 +1357,14 @@ return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; if (VecSize <= 256) // 32 bytes return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; + if (VecSize <= 288) // 36 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9; + if (VecSize <= 320) // 40 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10; + if (VecSize <= 352) // 44 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11; + if (VecSize <= 384) // 48 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12; if (VecSize <= 512) // 64 bytes return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16; if (VecSize <= 1024) // 128 bytes @@ -1421,6 +1445,14 @@ return AMDGPU::SI_SPILL_S224_SAVE; case 32: return AMDGPU::SI_SPILL_S256_SAVE; + case 36: + return AMDGPU::SI_SPILL_S288_SAVE; + case 40: + return AMDGPU::SI_SPILL_S320_SAVE; + case 44: + return AMDGPU::SI_SPILL_S352_SAVE; + case 48: + return AMDGPU::SI_SPILL_S384_SAVE; case 64: return AMDGPU::SI_SPILL_S512_SAVE; case 128: @@ -1448,6 +1480,14 @@ return AMDGPU::SI_SPILL_V224_SAVE; case 32: return AMDGPU::SI_SPILL_V256_SAVE; + case 36: + return AMDGPU::SI_SPILL_V288_SAVE; + case 40: + return AMDGPU::SI_SPILL_V320_SAVE; + case 44: + return AMDGPU::SI_SPILL_V352_SAVE; + case 48: + return AMDGPU::SI_SPILL_V384_SAVE; case 64: return AMDGPU::SI_SPILL_V512_SAVE; case 128: @@
-1588,6 +1628,14 @@ return AMDGPU::SI_SPILL_S224_RESTORE; case 32: return AMDGPU::SI_SPILL_S256_RESTORE; + case 36: + return AMDGPU::SI_SPILL_S288_RESTORE; + case 40: + return AMDGPU::SI_SPILL_S320_RESTORE; + case 44: + return AMDGPU::SI_SPILL_S352_RESTORE; + case 48: + return AMDGPU::SI_SPILL_S384_RESTORE; case 64: return AMDGPU::SI_SPILL_S512_RESTORE; case 128: @@ -1615,6 +1663,14 @@ return AMDGPU::SI_SPILL_V224_RESTORE; case 32: return AMDGPU::SI_SPILL_V256_RESTORE; + case 36: + return AMDGPU::SI_SPILL_V288_RESTORE; + case 40: + return AMDGPU::SI_SPILL_V320_RESTORE; + case 44: + return AMDGPU::SI_SPILL_V352_RESTORE; + case 48: + return AMDGPU::SI_SPILL_V384_RESTORE; case 64: return AMDGPU::SI_SPILL_V512_RESTORE; case 128: @@ -1642,6 +1698,14 @@ return AMDGPU::SI_SPILL_A224_RESTORE; case 32: return AMDGPU::SI_SPILL_A256_RESTORE; + case 36: + return AMDGPU::SI_SPILL_A288_RESTORE; + case 40: + return AMDGPU::SI_SPILL_A320_RESTORE; + case 44: + return AMDGPU::SI_SPILL_A352_RESTORE; + case 48: + return AMDGPU::SI_SPILL_A384_RESTORE; case 64: return AMDGPU::SI_SPILL_A512_RESTORE; case 128: @@ -1669,6 +1733,14 @@ return AMDGPU::SI_SPILL_AV224_RESTORE; case 32: return AMDGPU::SI_SPILL_AV256_RESTORE; + case 36: + return AMDGPU::SI_SPILL_AV288_RESTORE; + case 40: + return AMDGPU::SI_SPILL_AV320_RESTORE; + case 44: + return AMDGPU::SI_SPILL_AV352_RESTORE; + case 48: + return AMDGPU::SI_SPILL_AV384_RESTORE; case 64: return AMDGPU::SI_SPILL_AV512_RESTORE; case 128: @@ -1974,6 +2046,10 @@ case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: + case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9: + case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10: + case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11: + case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16: case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1: @@ -2025,6 +2101,10 @@ case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: + case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9: + case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10: + case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11: + case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16: case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: { assert(ST.useVGPRIndexMode()); @@ -2064,6 +2144,10 @@ case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: + case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9: + case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10: + case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11: + case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16: case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: { assert(ST.useVGPRIndexMode()); @@ -4531,7 +4615,7 @@ } else { const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; - if (AddrWords > 8) + if (AddrWords > 12) AddrWords = 16; } Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -650,6 +650,10 @@ def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V4 : 
SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC; +def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC; +def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC; +def SI_INDIRECT_SRC_V11 : SI_INDIRECT_SRC; +def SI_INDIRECT_SRC_V12 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC; @@ -657,6 +661,10 @@ def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V9 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V11 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V12 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST; @@ -698,6 +706,10 @@ def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo; def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo; def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo; +def V_INDIRECT_REG_WRITE_MOVREL_B32_V9 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo; +def V_INDIRECT_REG_WRITE_MOVREL_B32_V10 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo; +def V_INDIRECT_REG_WRITE_MOVREL_B32_V11 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo; +def V_INDIRECT_REG_WRITE_MOVREL_B32_V12 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo; def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo; def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo; @@ -735,6 +747,10 @@ def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo; +def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo; +def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo; +def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo; +def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo; def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo; @@ -751,6 +767,10 @@ def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo; def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo; def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo; +def V_INDIRECT_REG_READ_GPR_IDX_B32_V9 : V_INDIRECT_REG_READ_GPR_IDX_pseudo; +def V_INDIRECT_REG_READ_GPR_IDX_B32_V10 : V_INDIRECT_REG_READ_GPR_IDX_pseudo; +def V_INDIRECT_REG_READ_GPR_IDX_B32_V11 : V_INDIRECT_REG_READ_GPR_IDX_pseudo; +def V_INDIRECT_REG_READ_GPR_IDX_B32_V12 : V_INDIRECT_REG_READ_GPR_IDX_pseudo; def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo; def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo; @@ -784,6 +804,10 @@ defm SI_SPILL_S192 : SI_SPILL_SGPR ; defm SI_SPILL_S224 : SI_SPILL_SGPR ; defm SI_SPILL_S256 : SI_SPILL_SGPR ; +defm SI_SPILL_S288 : SI_SPILL_SGPR ; +defm SI_SPILL_S320 : SI_SPILL_SGPR ; +defm SI_SPILL_S352 : SI_SPILL_SGPR ; +defm SI_SPILL_S384 : SI_SPILL_SGPR ; defm SI_SPILL_S512 : SI_SPILL_SGPR ; defm SI_SPILL_S1024 : SI_SPILL_SGPR ; @@ -828,6 +852,10 @@ defm SI_SPILL_V192 : SI_SPILL_VGPR ; defm SI_SPILL_V224 : SI_SPILL_VGPR ; defm SI_SPILL_V256 : SI_SPILL_VGPR ; +defm SI_SPILL_V288 : SI_SPILL_VGPR ; +defm SI_SPILL_V320 : SI_SPILL_VGPR ; +defm SI_SPILL_V352 : SI_SPILL_VGPR 
; +defm SI_SPILL_V384 : SI_SPILL_VGPR ; defm SI_SPILL_V512 : SI_SPILL_VGPR ; defm SI_SPILL_V1024 : SI_SPILL_VGPR ; @@ -839,6 +867,10 @@ defm SI_SPILL_A192 : SI_SPILL_VGPR ; defm SI_SPILL_A224 : SI_SPILL_VGPR ; defm SI_SPILL_A256 : SI_SPILL_VGPR ; +defm SI_SPILL_A288 : SI_SPILL_VGPR ; +defm SI_SPILL_A320 : SI_SPILL_VGPR ; +defm SI_SPILL_A352 : SI_SPILL_VGPR ; +defm SI_SPILL_A384 : SI_SPILL_VGPR ; defm SI_SPILL_A512 : SI_SPILL_VGPR ; defm SI_SPILL_A1024 : SI_SPILL_VGPR ; @@ -850,6 +882,10 @@ defm SI_SPILL_AV192 : SI_SPILL_VGPR ; defm SI_SPILL_AV224 : SI_SPILL_VGPR ; defm SI_SPILL_AV256 : SI_SPILL_VGPR ; +defm SI_SPILL_AV288 : SI_SPILL_VGPR ; +defm SI_SPILL_AV320 : SI_SPILL_VGPR ; +defm SI_SPILL_AV352 : SI_SPILL_VGPR ; +defm SI_SPILL_AV384 : SI_SPILL_VGPR ; defm SI_SPILL_AV512 : SI_SPILL_VGPR ; defm SI_SPILL_AV1024 : SI_SPILL_VGPR ; @@ -1225,6 +1261,70 @@ >; } +foreach Index = 0-8 in { + def Extract_Element_v9i32_#Index : Extract_Element < + i32, v9i32, Index, !cast(sub#Index) + >; + def Insert_Element_v9i32_#Index : Insert_Element < + i32, v9i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v9f32_#Index : Extract_Element < + f32, v9f32, Index, !cast(sub#Index) + >; + def Insert_Element_v9f32_#Index : Insert_Element < + f32, v9f32, Index, !cast(sub#Index) + >; +} + +foreach Index = 0-9 in { + def Extract_Element_v10i32_#Index : Extract_Element < + i32, v10i32, Index, !cast(sub#Index) + >; + def Insert_Element_v10i32_#Index : Insert_Element < + i32, v10i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v10f32_#Index : Extract_Element < + f32, v10f32, Index, !cast(sub#Index) + >; + def Insert_Element_v10f32_#Index : Insert_Element < + f32, v10f32, Index, !cast(sub#Index) + >; +} + +foreach Index = 0-10 in { + def Extract_Element_v11i32_#Index : Extract_Element < + i32, v11i32, Index, !cast(sub#Index) + >; + def Insert_Element_v11i32_#Index : Insert_Element < + i32, v11i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v11f32_#Index : Extract_Element < + f32, v11f32, Index, !cast(sub#Index) + >; + def Insert_Element_v11f32_#Index : Insert_Element < + f32, v11f32, Index, !cast(sub#Index) + >; +} + +foreach Index = 0-11 in { + def Extract_Element_v12i32_#Index : Extract_Element < + i32, v12i32, Index, !cast(sub#Index) + >; + def Insert_Element_v12i32_#Index : Insert_Element < + i32, v12i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v12f32_#Index : Extract_Element < + f32, v12f32, Index, !cast(sub#Index) + >; + def Insert_Element_v12f32_#Index : Insert_Element < + f32, v12f32, Index, !cast(sub#Index) + >; +} + foreach Index = 0-15 in { def Extract_Element_v16i32_#Index : Extract_Element < i32, v16i32, Index, !cast(sub#Index) @@ -1482,6 +1582,30 @@ def : BitConvert ; def : BitConvert ; +// 288-bit bitcast +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +// 320-bit bitcast +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +// 352-bit bitcast +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +// 384-bit bitcast +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + // 512-bit bitcast def : BitConvert ; def : BitConvert ; @@ -2022,12 +2146,20 @@ defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm :
SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2449,6 +2449,14 @@ return &AMDGPU::VReg_224RegClass; if (BitWidth <= 256) return &AMDGPU::VReg_256RegClass; + if (BitWidth <= 288) + return &AMDGPU::VReg_288RegClass; + if (BitWidth <= 320) + return &AMDGPU::VReg_320RegClass; + if (BitWidth <= 352) + return &AMDGPU::VReg_352RegClass; + if (BitWidth <= 384) + return &AMDGPU::VReg_384RegClass; if (BitWidth <= 512) return &AMDGPU::VReg_512RegClass; if (BitWidth <= 1024) @@ -2473,6 +2481,14 @@ return &AMDGPU::VReg_224_Align2RegClass; if (BitWidth <= 256) return &AMDGPU::VReg_256_Align2RegClass; + if (BitWidth <= 288) + return &AMDGPU::VReg_288_Align2RegClass; + if (BitWidth <= 320) + return &AMDGPU::VReg_320_Align2RegClass; + if (BitWidth <= 352) + return &AMDGPU::VReg_352_Align2RegClass; + if (BitWidth <= 384) + return &AMDGPU::VReg_384_Align2RegClass; if (BitWidth <= 512) return &AMDGPU::VReg_512_Align2RegClass; if (BitWidth <= 1024) @@ -2509,6 +2525,14 @@ return &AMDGPU::AReg_224RegClass; if (BitWidth <= 256) return &AMDGPU::AReg_256RegClass; + if (BitWidth <= 288) + return &AMDGPU::AReg_288RegClass; + if (BitWidth <= 320) + return &AMDGPU::AReg_320RegClass; + if (BitWidth <= 352) + return &AMDGPU::AReg_352RegClass; + if (BitWidth <= 384) + return &AMDGPU::AReg_384RegClass; if (BitWidth <= 512) return &AMDGPU::AReg_512RegClass; if (BitWidth <= 1024) @@ -2533,6 +2557,14 @@ return &AMDGPU::AReg_224_Align2RegClass; if (BitWidth <= 256) return &AMDGPU::AReg_256_Align2RegClass; + if (BitWidth <= 288) + return &AMDGPU::AReg_288_Align2RegClass; + if (BitWidth <= 320) + return &AMDGPU::AReg_320_Align2RegClass; + if (BitWidth <= 352) + return &AMDGPU::AReg_352_Align2RegClass; + if (BitWidth <= 384) + return &AMDGPU::AReg_384_Align2RegClass; if (BitWidth <= 512) return &AMDGPU::AReg_512_Align2RegClass; if (BitWidth <= 1024) @@ -2567,6 +2599,14 @@ return &AMDGPU::AV_224RegClass; if (BitWidth <= 256) return &AMDGPU::AV_256RegClass; + if (BitWidth <= 288) + return &AMDGPU::AV_288RegClass; + if (BitWidth <= 320) + return &AMDGPU::AV_320RegClass; + if (BitWidth <= 352) + return &AMDGPU::AV_352RegClass; + if (BitWidth <= 384) + return &AMDGPU::AV_384RegClass; if (BitWidth <= 512) return &AMDGPU::AV_512RegClass; if (BitWidth <= 1024) @@ -2591,6 +2631,14 @@ return &AMDGPU::AV_224_Align2RegClass; if (BitWidth <= 256) return &AMDGPU::AV_256_Align2RegClass; + if (BitWidth <= 288) + return &AMDGPU::AV_288_Align2RegClass; + if (BitWidth <= 320) + return &AMDGPU::AV_320_Align2RegClass; + if (BitWidth <= 352) + return &AMDGPU::AV_352_Align2RegClass; + if (BitWidth <= 384) + return &AMDGPU::AV_384_Align2RegClass; if (BitWidth <= 512) return &AMDGPU::AV_512_Align2RegClass; if (BitWidth <= 1024) @@ -2630,6 +2678,14 @@ return &AMDGPU::SGPR_224RegClass; if (BitWidth <= 256) return &AMDGPU::SGPR_256RegClass; + if (BitWidth <= 288) + return &AMDGPU::SGPR_288RegClass; + if (BitWidth <= 320) + return &AMDGPU::SGPR_320RegClass; + if (BitWidth <= 352) + return &AMDGPU::SGPR_352RegClass; + if (BitWidth <= 384) + return &AMDGPU::SGPR_384RegClass; if (BitWidth <= 512) return &AMDGPU::SGPR_512RegClass; if (BitWidth <= 1024) @@ -2686,6 +2742,26 @@ 
&AMDGPU::SReg_256RegClass, &AMDGPU::AReg_256_Align2RegClass, &AMDGPU::AReg_256RegClass, + &AMDGPU::VReg_288_Align2RegClass, + &AMDGPU::VReg_288RegClass, + &AMDGPU::SReg_288RegClass, + &AMDGPU::AReg_288_Align2RegClass, + &AMDGPU::AReg_288RegClass, + &AMDGPU::VReg_320_Align2RegClass, + &AMDGPU::VReg_320RegClass, + &AMDGPU::SReg_320RegClass, + &AMDGPU::AReg_320_Align2RegClass, + &AMDGPU::AReg_320RegClass, + &AMDGPU::VReg_352_Align2RegClass, + &AMDGPU::VReg_352RegClass, + &AMDGPU::SReg_352RegClass, + &AMDGPU::AReg_352_Align2RegClass, + &AMDGPU::AReg_352RegClass, + &AMDGPU::VReg_384_Align2RegClass, + &AMDGPU::VReg_384RegClass, + &AMDGPU::SReg_384RegClass, + &AMDGPU::AReg_384_Align2RegClass, + &AMDGPU::AReg_384RegClass, &AMDGPU::VReg_512_Align2RegClass, &AMDGPU::VReg_512RegClass, &AMDGPU::SReg_512RegClass, Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -60,6 +60,16 @@ list ret6 = [sub0, sub1, sub2, sub3, sub4, sub5]; list ret7 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6]; list ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7]; + list ret9 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8]; + list ret10 = [sub0, sub1, sub2, sub3, + sub4, sub5, sub6, sub7, + sub8, sub9]; + list ret11 = [sub0, sub1, sub2, sub3, + sub4, sub5, sub6, sub7, + sub8, sub9, sub10]; + list ret12 = [sub0, sub1, sub2, sub3, + sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11]; list ret16 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10, sub11, @@ -80,8 +90,12 @@ !if(!eq(size, 6), ret6, !if(!eq(size, 7), ret7, !if(!eq(size, 8), ret8, - !if(!eq(size, 16), ret16, - ret32)))))))); + !if(!eq(size, 9), ret9, + !if(!eq(size, 10), ret10, + !if(!eq(size, 11), ret11, + !if(!eq(size, 12), ret12, + !if(!eq(size, 16), ret16, + ret32)))))))))))); } // Generates list of sequential register tuple names. @@ -423,6 +437,18 @@ // SGPR 256-bit registers def SGPR_256Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 8, "s">; +// SGPR 288-bit registers. No operations use these, but for symmetry with 288-bit VGPRs. +def SGPR_288Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 9, "s">; + +// SGPR 320-bit registers. No operations use these, but for symmetry with 320-bit VGPRs. +def SGPR_320Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 10, "s">; + +// SGPR 352-bit registers. No operations use these, but for symmetry with 352-bit VGPRs. +def SGPR_352Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 11, "s">; + +// SGPR 384-bit registers. No operations use these, but for symmetry with 384-bit VGPRs. 
+def SGPR_384Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 12, "s">; + // SGPR 512-bit registers def SGPR_512Regs : SIRegisterTuples.ret, SGPR_32, 105, 4, 16, "s">; @@ -465,6 +491,18 @@ // Trap handler TMP 256-bit registers def TTMP_256Regs : SIRegisterTuples.ret, TTMP_32, 15, 4, 8, "ttmp">; +// Trap handler TMP 288-bit registers +def TTMP_288Regs : SIRegisterTuples.ret, TTMP_32, 15, 4, 9, "ttmp">; + +// Trap handler TMP 320-bit registers +def TTMP_320Regs : SIRegisterTuples.ret, TTMP_32, 15, 4, 10, "ttmp">; + +// Trap handler TMP 352-bit registers +def TTMP_352Regs : SIRegisterTuples.ret, TTMP_32, 15, 4, 11, "ttmp">; + +// Trap handler TMP 384-bit registers +def TTMP_384Regs : SIRegisterTuples.ret, TTMP_32, 15, 4, 12, "ttmp">; + // Trap handler TMP 512-bit registers def TTMP_512Regs : SIRegisterTuples.ret, TTMP_32, 15, 4, 16, "ttmp">; @@ -609,6 +647,18 @@ // VGPR 256-bit registers def VGPR_256 : SIRegisterTuples.ret, VGPR_32, 255, 1, 8, "v">; +// VGPR 288-bit registers +def VGPR_288 : SIRegisterTuples.ret, VGPR_32, 255, 1, 9, "v">; + +// VGPR 320-bit registers +def VGPR_320 : SIRegisterTuples.ret, VGPR_32, 255, 1, 10, "v">; + +// VGPR 352-bit registers +def VGPR_352 : SIRegisterTuples.ret, VGPR_32, 255, 1, 11, "v">; + +// VGPR 384-bit registers +def VGPR_384 : SIRegisterTuples.ret, VGPR_32, 255, 1, 12, "v">; + // VGPR 512-bit registers def VGPR_512 : SIRegisterTuples.ret, VGPR_32, 255, 1, 16, "v">; @@ -653,6 +703,18 @@ // AGPR 256-bit registers def AGPR_256 : SIRegisterTuples.ret, AGPR_32, 255, 1, 8, "a">; +// AGPR 288-bit registers +def AGPR_288 : SIRegisterTuples.ret, AGPR_32, 255, 1, 9, "a">; + +// AGPR 320-bit registers +def AGPR_320 : SIRegisterTuples.ret, AGPR_32, 255, 1, 10, "a">; + +// AGPR 352-bit registers +def AGPR_352 : SIRegisterTuples.ret, AGPR_32, 255, 1, 11, "a">; + +// AGPR 384-bit registers +def AGPR_384 : SIRegisterTuples.ret, AGPR_32, 255, 1, 12, "a">; + // AGPR 512-bit registers def AGPR_512 : SIRegisterTuples.ret, AGPR_32, 255, 1, 16, "a">; @@ -829,6 +891,10 @@ defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>; defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>; defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>; +defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>; +defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>; +defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>; +defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>; let GlobalPriority = true in { defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>; @@ -873,6 +939,10 @@ defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>; defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>; +defm VReg_288 : VRegClass<9, [v9i32, v9f32], (add VGPR_288)>; +defm VReg_320 : VRegClass<10, [v10i32, v10f32], (add VGPR_320)>; +defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>; +defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>; let GlobalPriority = true in { defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; @@ -897,6 +967,10 @@ defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>; defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>; defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>; +defm AReg_288 : ARegClass<9, 
[v9i32, v9f32], (add AGPR_288)>; +defm AReg_320 : ARegClass<10, [v10i32, v10f32], (add AGPR_320)>; +defm AReg_352 : ARegClass<11, [v11i32, v11f32], (add AGPR_352)>; +defm AReg_384 : ARegClass<12, [v12i32, v12f32], (add AGPR_384)>; let GlobalPriority = true in { defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>; @@ -963,6 +1037,10 @@ defm AV_192 : AVRegClass<6, VReg_192.RegTypes, (add VGPR_192), (add AGPR_192)>; defm AV_224 : AVRegClass<7, VReg_224.RegTypes, (add VGPR_224), (add AGPR_224)>; defm AV_256 : AVRegClass<8, VReg_256.RegTypes, (add VGPR_256), (add AGPR_256)>; +defm AV_288 : AVRegClass<9, VReg_288.RegTypes, (add VGPR_288), (add AGPR_288)>; +defm AV_320 : AVRegClass<10, VReg_320.RegTypes, (add VGPR_320), (add AGPR_320)>; +defm AV_352 : AVRegClass<11, VReg_352.RegTypes, (add VGPR_352), (add AGPR_352)>; +defm AV_384 : AVRegClass<12, VReg_384.RegTypes, (add VGPR_384), (add AGPR_384)>; let GlobalPriority = true in { defm AV_512 : AVRegClass<16, VReg_512.RegTypes, (add VGPR_512), (add AGPR_512)>; Index: llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -292,6 +292,14 @@ RC = &AMDGPU::VReg_224RegClass; } else if (Info->VAddrDwords == 8) { RC = &AMDGPU::VReg_256RegClass; + } else if (Info->VAddrDwords == 9) { + RC = &AMDGPU::VReg_288RegClass; + } else if (Info->VAddrDwords == 10) { + RC = &AMDGPU::VReg_320RegClass; + } else if (Info->VAddrDwords == 11) { + RC = &AMDGPU::VReg_352RegClass; + } else if (Info->VAddrDwords == 12) { + RC = &AMDGPU::VReg_384RegClass; } else { RC = &AMDGPU::VReg_512RegClass; NewAddrDwords = 16; Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2175,6 +2175,42 @@ case AMDGPU::AV_256RegClassID: case AMDGPU::AV_256_Align2RegClassID: return 256; + case AMDGPU::SGPR_288RegClassID: + case AMDGPU::SReg_288RegClassID: + case AMDGPU::VReg_288RegClassID: + case AMDGPU::AReg_288RegClassID: + case AMDGPU::VReg_288_Align2RegClassID: + case AMDGPU::AReg_288_Align2RegClassID: + case AMDGPU::AV_288RegClassID: + case AMDGPU::AV_288_Align2RegClassID: + return 288; + case AMDGPU::SGPR_320RegClassID: + case AMDGPU::SReg_320RegClassID: + case AMDGPU::VReg_320RegClassID: + case AMDGPU::AReg_320RegClassID: + case AMDGPU::VReg_320_Align2RegClassID: + case AMDGPU::AReg_320_Align2RegClassID: + case AMDGPU::AV_320RegClassID: + case AMDGPU::AV_320_Align2RegClassID: + return 320; + case AMDGPU::SGPR_352RegClassID: + case AMDGPU::SReg_352RegClassID: + case AMDGPU::VReg_352RegClassID: + case AMDGPU::AReg_352RegClassID: + case AMDGPU::VReg_352_Align2RegClassID: + case AMDGPU::AReg_352_Align2RegClassID: + case AMDGPU::AV_352RegClassID: + case AMDGPU::AV_352_Align2RegClassID: + return 352; + case AMDGPU::SGPR_384RegClassID: + case AMDGPU::SReg_384RegClassID: + case AMDGPU::VReg_384RegClassID: + case AMDGPU::AReg_384RegClassID: + case AMDGPU::VReg_384_Align2RegClassID: + case AMDGPU::AReg_384_Align2RegClassID: + case AMDGPU::AV_384RegClassID: + case AMDGPU::AV_384_Align2RegClassID: + return 384; case AMDGPU::SGPR_512RegClassID: case AMDGPU::SReg_512RegClassID: case AMDGPU::VReg_512RegClassID: Index: llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll =================================================================== --- 
llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll +++ llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll @@ -15,7 +15,7 @@ ; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v6i32 = add <6 x i32> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v7i32 = add <7 x i32> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = add <8 x i32> undef, undef -; ALL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9i32 = add <9 x i32> undef, undef +; ALL-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9i32 = add <9 x i32> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; ALL-SIZE-LABEL: 'add_i32' @@ -27,7 +27,7 @@ ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v6i32 = add <6 x i32> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v7i32 = add <7 x i32> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = add <8 x i32> undef, undef -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9i32 = add <9 x i32> undef, undef +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9i32 = add <9 x i32> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %i32 = add i32 undef, undef Index: llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll +++ llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll @@ -50,7 +50,7 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -79,7 +79,7 @@ ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; SLOW-NEXT: Cost Model: Found an 
estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -108,7 +108,7 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -137,7 +137,7 @@ ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -230,7 +230,7 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; FAST-NEXT: Cost Model: Found 
an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -259,7 +259,7 @@ ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -288,7 +288,7 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -317,7 +317,7 @@ ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost 
of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) Index: llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll +++ llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll @@ -50,7 +50,7 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -79,7 +79,7 @@ ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -108,7 +108,7 @@ ; 
FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -137,7 +137,7 @@ ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -230,7 +230,7 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) ; FAST-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -259,7 +259,7 @@ ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -288,7 +288,7 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) @@ -317,7 +317,7 @@ ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> 
undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) Index: llvm/test/Analysis/CostModel/AMDGPU/fadd.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -15,7 +15,7 @@ ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fadd <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOPACKEDF32-LABEL: 'fadd_f32' @@ -25,7 +25,7 @@ ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fadd <9 x float> undef, undef +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fadd_f32' @@ -35,7 +35,7 @@ ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fadd <9 x float> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fadd_f32' @@ -45,7 +45,7 @@ ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fadd <9 x float> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f32 = fadd float undef, undef Index: 
llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll +++ llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll @@ -20,7 +20,7 @@ ; ALL-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v4f32 = fdiv <4 x float> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v5f32 = fdiv <5 x float> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v8f32 = fdiv <8 x float> undef, undef -; ALL-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %v9f32 = fdiv <9 x float> undef, undef +; ALL-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %v9f32 = fdiv <9 x float> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; ALL-SIZE-LABEL: 'fdiv_f32_ieee' @@ -30,7 +30,7 @@ ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f32 = fdiv <4 x float> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v5f32 = fdiv <5 x float> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f32 = fdiv <8 x float> undef, undef -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v9f32 = fdiv <9 x float> undef, undef +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %v9f32 = fdiv <9 x float> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f32 = fdiv float undef, undef @@ -51,7 +51,7 @@ ; ALL-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4f32 = fdiv <4 x float> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v5f32 = fdiv <5 x float> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8f32 = fdiv <8 x float> undef, undef -; ALL-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %v9f32 = fdiv <9 x float> undef, undef +; ALL-NEXT: Cost Model: Found an estimated cost of 432 for instruction: %v9f32 = fdiv <9 x float> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; ALL-SIZE-LABEL: 'fdiv_f32_ftzdaz' @@ -61,7 +61,7 @@ ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v4f32 = fdiv <4 x float> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v5f32 = fdiv <5 x float> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v8f32 = fdiv <8 x float> undef, undef -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %v9f32 = fdiv <9 x float> undef, undef +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %v9f32 = fdiv <9 x float> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f32 = fdiv float undef, undef Index: llvm/test/Analysis/CostModel/AMDGPU/fma.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -17,7 +17,7 @@ ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2 ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> 
undef, <5 x float> undef, <5 x float> undef) #2 ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2 -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF64-LABEL: 'fma_f32' @@ -27,7 +27,7 @@ ; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2 ; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2 ; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2 -; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 +; FASTF64-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 ; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fma_f32' @@ -37,7 +37,7 @@ ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2 ; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2 ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2 -; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 +; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fma_f32' @@ -47,7 +47,7 @@ ; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2 ; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2 ; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2 -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call 
<9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 ; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; FASTF64-SIZE-LABEL: 'fma_f32' @@ -57,7 +57,7 @@ ; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2 ; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2 ; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2 -; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 +; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 ; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_f32' @@ -67,7 +67,7 @@ ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2 ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2 ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2 -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #1 Index: llvm/test/Analysis/CostModel/AMDGPU/fmul.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -15,7 +15,7 @@ ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fmul <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; F32-LABEL: 'fmul_f32' @@ -25,7 +25,7 @@ ; F32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef ; F32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef ; F32-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fmul <9 x float> undef, undef +; F32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef ; F32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-SIZE-LABEL: 'fmul_f32' @@ -35,7 +35,7 @@ ; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef ; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef ; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fmul <9 x float> undef, undef +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef ; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'fmul_f32' @@ -45,7 +45,7 @@ ; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef ; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef ; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fmul <9 x float> undef, undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f32 = fmul float undef, undef Index: llvm/test/Analysis/CostModel/AMDGPU/fsub.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -15,7 +15,7 @@ ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fsub <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOPACKEDF32-LABEL: 'fsub_f32' @@ -25,7 +25,7 @@ ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fsub <9 x float> undef, undef +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fsub_f32' @@ -35,7 +35,7 @@ ; 
GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v9f32 = fsub <9 x float> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fsub_f32' @@ -45,7 +45,7 @@ ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = fsub <9 x float> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f32 = fsub float undef, undef Index: llvm/test/Analysis/CostModel/AMDGPU/mul.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/mul.ll +++ llvm/test/Analysis/CostModel/AMDGPU/mul.ll @@ -13,7 +13,7 @@ ; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i32 = mul <4 x i32> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5i32 = mul <5 x i32> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8i32 = mul <8 x i32> undef, undef -; ALL-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v9i32 = mul <9 x i32> undef, undef +; ALL-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9i32 = mul <9 x i32> undef, undef ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; ALL-SIZE-LABEL: 'mul_i32' @@ -23,7 +23,7 @@ ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = mul <4 x i32> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5i32 = mul <5 x i32> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i32 = mul <8 x i32> undef, undef -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9i32 = mul <9 x i32> undef, undef +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9i32 = mul <9 x i32> undef, undef ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %i32 = mul i32 undef, undef Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -2671,6 +2671,1208 @@ ret void } +define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_s(<9 x float> inreg %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v9f32_s_v_s: +; GPRIDX: ; %bb.0: ; 
%entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v9, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s8 +; GPRIDX-NEXT: s_set_gpr_idx_on s11, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v9 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_insertelement_v9f32_s_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_mov_b32 m0, s11 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s7 +; GFX10-NEXT: v_mov_b32_e32 v8, s8 +; GFX10-NEXT: v_movreld_b32_e32 v0, v9 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v9f32_s_v_s: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: s_mov_b32 m0, s11 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s4 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s6 +; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v8, s8 +; GFX11-NEXT: v_movreld_b32_e32 v0, v9 +; GFX11-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <9 x float> %vec, float %val, i32 %idx + ret <9 x float> %insert +} + +define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_s_v_v(<9 x float> inreg %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v9f32_s_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v11, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc +; 
GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s7 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v18, v0, vcc +; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v9 +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_insertelement_v9f32_s_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v18, s8 +; GFX10-NEXT: v_mov_b32_e32 v10, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_mov_b32_e32 v11, s1 +; GFX10-NEXT: v_mov_b32_e32 v12, s2 +; GFX10-NEXT: v_mov_b32_e32 v13, s3 +; GFX10-NEXT: v_mov_b32_e32 v14, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_mov_b32_e32 v15, s5 +; GFX10-NEXT: v_mov_b32_e32 v16, s6 +; GFX10-NEXT: v_mov_b32_e32 v17, s7 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v11, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v18, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, v10 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v9f32_s_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: v_dual_mov_b32 v18, s8 :: v_dual_mov_b32 v17, s7 +; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_dual_mov_b32 v14, s4 :: v_dual_mov_b32 v13, s3 +; GFX11-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v15, s5 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, 
v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 +; GFX11-NEXT: v_dual_mov_b32 v1, v9 :: v_dual_cndmask_b32 v8, v18, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, v10 +; GFX11-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <9 x float> %vec, float %val, i32 %idx + ret <9 x float> %insert +} + +define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_v_v_s(<9 x float> %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v9f32_v_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v9 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: dyn_insertelement_v9f32_v_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: v_movreld_b32_e32 v0, v9 +; GFX10PLUS-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <9 x float> %vec, float %val, i32 %idx + ret <9 x float> %insert +} + +define amdgpu_ps <9 x float> @dyn_insertelement_v9f32_v_v_v(<9 x float> %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v9f32_v_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v10 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: dyn_insertelement_v9f32_v_v_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v10 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v10 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v10 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v10 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v10 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v10 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v10 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v9, 
vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <9 x float> %vec, float %val, i32 %idx + ret <9 x float> %insert +} + +define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_s(<10 x float> inreg %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v10f32_s_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v10, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s9 +; GPRIDX-NEXT: s_set_gpr_idx_on s12, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_insertelement_v10f32_s_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_mov_b32 m0, s12 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s7 +; GFX10-NEXT: v_mov_b32_e32 v8, s8 +; GFX10-NEXT: v_mov_b32_e32 v9, s9 +; GFX10-NEXT: v_movreld_b32_e32 v0, v10 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v10f32_s_v_s: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: v_mov_b32_e32 v10, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 m0, s12 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 +; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 +; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8 +; GFX11-NEXT: v_movreld_b32_e32 v0, v10 +; GFX11-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <10 x float> %vec, float %val, i32 %idx + ret <10 x float> %insert +} + +define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_s_v_v(<10 x float> inreg %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v10f32_s_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; 
GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v19, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s7 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s8 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 9, v1 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 8, v1 +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v18, v0, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v19, v0, vcc +; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v11 +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_insertelement_v10f32_s_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: v_mov_b32_e32 v19, s9 +; GFX10-NEXT: v_mov_b32_e32 v10, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_mov_b32_e32 v11, s1 +; GFX10-NEXT: v_mov_b32_e32 v12, s2 +; GFX10-NEXT: v_mov_b32_e32 v13, s3 +; GFX10-NEXT: v_mov_b32_e32 v14, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_mov_b32_e32 v15, s5 +; GFX10-NEXT: v_mov_b32_e32 v16, s6 +; GFX10-NEXT: v_mov_b32_e32 v17, s7 +; GFX10-NEXT: v_mov_b32_e32 v18, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v18, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v19, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, v10 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v10f32_s_v_v: +; GFX11: ; %bb.0: ; %entry 
+; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: v_dual_mov_b32 v19, s9 :: v_dual_mov_b32 v18, s8 +; GFX11-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX11-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v12, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v13, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v14, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v15, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v16, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v18, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v11 +; GFX11-NEXT: v_dual_cndmask_b32 v9, v19, v0 :: v_dual_mov_b32 v0, v10 +; GFX11-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <10 x float> %vec, float %val, i32 %idx + ret <10 x float> %insert +} + +define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_v_v_s(<10 x float> %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v10f32_v_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: dyn_insertelement_v10f32_v_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: v_movreld_b32_e32 v0, v10 +; GFX10PLUS-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <10 x float> %vec, float %val, i32 %idx + ret <10 x float> %insert +} + +define amdgpu_ps <10 x float> @dyn_insertelement_v10f32_v_v_v(<10 x float> %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v10f32_v_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v11 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v11 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v11 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v11 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v11 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v11 +; GPRIDX-NEXT: 
v_cndmask_b32_e32 v8, v8, v10, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 9, v11 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: dyn_insertelement_v10f32_v_v_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v11 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v11 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v11 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v11 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v11 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v11 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v11 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <10 x float> %vec, float %val, i32 %idx + ret <10 x float> %insert +} + +define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_s(<11 x float> inreg %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v11f32_s_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v11, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s10 +; GPRIDX-NEXT: s_set_gpr_idx_on s13, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v11 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_insertelement_v11f32_s_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: v_mov_b32_e32 v11, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_mov_b32 m0, s13 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s7 +; GFX10-NEXT: v_mov_b32_e32 v8, s8 +; GFX10-NEXT: v_mov_b32_e32 v9, s9 +; 
GFX10-NEXT: v_mov_b32_e32 v10, s10 +; GFX10-NEXT: v_movreld_b32_e32 v0, v11 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v11f32_s_v_s: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: s_mov_b32 m0, s13 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s4 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s6 +; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v8, s8 +; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v10, s10 +; GFX11-NEXT: v_movreld_b32_e32 v0, v11 +; GFX11-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <11 x float> %vec, float %val, i32 %idx + ret <11 x float> %insert +} + +define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_s_v_v(<11 x float> inreg %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v11f32_s_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v22, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v13, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v21, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v20, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v19, s7 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v1 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 9, v1 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 10, v1 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v1 +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v19, v0, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v21, v0, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v22, v0, s[2:3] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v11 +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_insertelement_v11f32_s_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: 
s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: v_mov_b32_e32 v22, s10 +; GFX10-NEXT: v_mov_b32_e32 v12, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_mov_b32_e32 v13, s1 +; GFX10-NEXT: v_mov_b32_e32 v14, s2 +; GFX10-NEXT: v_mov_b32_e32 v15, s3 +; GFX10-NEXT: v_mov_b32_e32 v16, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_mov_b32_e32 v17, s5 +; GFX10-NEXT: v_mov_b32_e32 v18, s6 +; GFX10-NEXT: v_mov_b32_e32 v19, s7 +; GFX10-NEXT: v_mov_b32_e32 v20, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v13, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX10-NEXT: v_mov_b32_e32 v21, s9 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v22, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, v12 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v11f32_s_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: v_dual_mov_b32 v22, s10 :: v_dual_mov_b32 v21, s9 +; GFX11-NEXT: v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v13, s1 +; GFX11-NEXT: v_mov_b32_e32 v12, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_dual_mov_b32 v16, s4 :: v_dual_mov_b32 v15, s3 +; GFX11-NEXT: v_dual_mov_b32 v18, s6 :: v_dual_mov_b32 v17, s5 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_dual_mov_b32 v20, s8 :: v_dual_mov_b32 v19, s7 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 9, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1 +; GFX11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_cndmask_b32 v10, v22, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, v12 +; GFX11-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <11 x float> %vec, float %val, i32 %idx + ret <11 x float> %insert +} + +define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_v_v_s(<11 x float> %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v11f32_v_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v11 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: dyn_insertelement_v11f32_v_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: v_movreld_b32_e32 v0, v11 +; GFX10PLUS-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <11 x float> %vec, float %val, i32 %idx + ret <11 x float> %insert +} + +define amdgpu_ps <11 x float> @dyn_insertelement_v11f32_v_v_v(<11 x float> %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v11f32_v_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12 +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: dyn_insertelement_v11f32_v_v_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v12 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v12 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v12 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v12 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v12 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 10, v12 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <11 x float> %vec, float %val, i32 %idx + ret <11 x float> %insert +} + +define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_s(<12 x float> inreg %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v12f32_s_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v12, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s11 +; GPRIDX-NEXT: s_set_gpr_idx_on s14, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v12 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_insertelement_v12f32_s_v_s: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: v_mov_b32_e32 v12, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_mov_b32 m0, s14 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s7 +; GFX10-NEXT: v_mov_b32_e32 v8, s8 +; GFX10-NEXT: v_mov_b32_e32 v9, s9 +; GFX10-NEXT: v_mov_b32_e32 v10, s10 +; GFX10-NEXT: v_mov_b32_e32 v11, s11 +; GFX10-NEXT: v_movreld_b32_e32 v0, v12 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v12f32_s_v_s: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: v_mov_b32_e32 v12, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 m0, s14 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v4, s4 +; GFX11-NEXT: v_dual_mov_b32 v7, s7 :: v_dual_mov_b32 v6, s6 +; GFX11-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v8, s8 +; GFX11-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v10, s10 +; GFX11-NEXT: v_movreld_b32_e32 v0, v12 +; GFX11-NEXT: ; return to shader part epilog 
+entry: + %insert = insertelement <12 x float> %vec, float %val, i32 %idx + ret <12 x float> %insert +} + +define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_s_v_v(<12 x float> inreg %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v12f32_s_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v23, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s5 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v22, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v21, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v20, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v19, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 8, v1 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 9, v1 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 10, v1 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 11, v1 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v1 +; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v18, v0, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v20, v0, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v21, v0, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v22, v0, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v23, v0, s[6:7] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v13 +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: dyn_insertelement_v12f32_s_v_v: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: s_mov_b32 s9, s11 +; GFX10-NEXT: s_mov_b32 s11, s13 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s8, s10 +; GFX10-NEXT: s_mov_b32 s10, s12 +; GFX10-NEXT: v_mov_b32_e32 v23, s11 +; GFX10-NEXT: v_mov_b32_e32 v12, s0 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_mov_b32_e32 v13, s1 +; GFX10-NEXT: v_mov_b32_e32 v14, s2 +; GFX10-NEXT: v_mov_b32_e32 v15, s3 +; GFX10-NEXT: v_mov_b32_e32 v16, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_mov_b32_e32 v17, s5 +; GFX10-NEXT: v_mov_b32_e32 v18, s6 +; GFX10-NEXT: v_mov_b32_e32 v19, s7 +; GFX10-NEXT: v_mov_b32_e32 v20, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; 
GFX10-NEXT: v_mov_b32_e32 v21, s9 +; GFX10-NEXT: v_mov_b32_e32 v22, s10 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v22, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v23, v0, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v0, v12 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_insertelement_v12f32_s_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: v_dual_mov_b32 v23, s11 :: v_dual_mov_b32 v22, s10 +; GFX11-NEXT: v_dual_mov_b32 v13, s1 :: v_dual_mov_b32 v12, s0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_dual_mov_b32 v15, s3 :: v_dual_mov_b32 v14, s2 +; GFX11-NEXT: v_dual_mov_b32 v17, s5 :: v_dual_mov_b32 v16, s4 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s6 +; GFX11-NEXT: v_dual_mov_b32 v21, s9 :: v_dual_mov_b32 v20, s8 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v14, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v15, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v16, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v17, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v18, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v20, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v22, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v13 +; GFX11-NEXT: v_dual_cndmask_b32 v11, v23, v0 :: v_dual_mov_b32 v0, v12 +; GFX11-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <12 x float> %vec, float %val, i32 %idx + ret <12 x float> %insert +} + +define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_v_v_s(<12 x float> %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: 
dyn_insertelement_v12f32_v_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v12 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: dyn_insertelement_v12f32_v_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: v_movreld_b32_e32 v0, v12 +; GFX10PLUS-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <12 x float> %vec, float %val, i32 %idx + ret <12 x float> %insert +} + +define amdgpu_ps <12 x float> @dyn_insertelement_v12f32_v_v_v(<12 x float> %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v12f32_v_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 8, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 9, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 10, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 11, v13 +; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc +; GPRIDX-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: dyn_insertelement_v12f32_v_v_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v13 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v11, v11, v12, vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <12 x float> %vec, float %val, i32 %idx + ret <12 x float> %insert +} + define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_s_s(<16 x 
i32> inreg %vec, i32 inreg %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v16i32_s_s_s: ; GPRIDX: ; %bb.0: ; %entry @@ -5246,47 +6448,41 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg %vec, double %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 ; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s10 +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s9 -; GPRIDX-NEXT: v_mov_b32_e32 v10, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v9, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v8, s6 -; GPRIDX-NEXT: v_mov_b32_e32 v7, s5 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v4, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 0 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 1 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 2 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[4:5] ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5] +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 2 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 3 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s8 +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v2 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v3 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v4 @@ -5301,22 +6497,16 @@ ; ; GFX10-LABEL: dyn_insertelement_v5f64_s_v_s: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 ; 
GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: v_mov_b32_e32 v17, s15 -; GFX10-NEXT: v_mov_b32_e32 v16, s14 -; GFX10-NEXT: v_mov_b32_e32 v15, s13 -; GFX10-NEXT: v_mov_b32_e32 v14, s12 -; GFX10-NEXT: v_mov_b32_e32 v13, s11 -; GFX10-NEXT: v_mov_b32_e32 v12, s10 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; GFX10-NEXT: v_mov_b32_e32 v9, s7 @@ -5356,19 +6546,16 @@ ; ; GFX11-LABEL: dyn_insertelement_v5f64_s_v_s: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: v_dual_mov_b32 v17, s15 :: v_dual_mov_b32 v16, s14 -; GFX11-NEXT: v_dual_mov_b32 v15, s13 :: v_dual_mov_b32 v14, s12 -; GFX11-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v12, s10 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: v_dual_mov_b32 v11, s9 :: v_dual_mov_b32 v10, s8 ; GFX11-NEXT: v_dual_mov_b32 v9, s7 :: v_dual_mov_b32 v8, s6 ; GFX11-NEXT: v_dual_mov_b32 v7, s5 :: v_dual_mov_b32 v6, s4 @@ -5406,77 +6593,65 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_v(<5 x double> inreg %vec, double %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v5f64_s_v_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 ; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 -; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 -; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 -; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v0, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v11, v0, s[4:5] ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc -; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[6:7] -; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1] -; GPRIDX-NEXT: v_cndmask_b32_e64 v9, 
v10, v1, s[2:3] -; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5] +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 4, v2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v1, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v11, v0, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[0:1] ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v5 ; GPRIDX-NEXT: v_readfirstlane_b32 s3, v6 -; GPRIDX-NEXT: v_readfirstlane_b32 s4, v5 +; GPRIDX-NEXT: v_readfirstlane_b32 s4, v7 ; GPRIDX-NEXT: v_readfirstlane_b32 s5, v8 -; GPRIDX-NEXT: v_readfirstlane_b32 s6, v7 -; GPRIDX-NEXT: v_readfirstlane_b32 s7, v9 +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v9 +; GPRIDX-NEXT: v_readfirstlane_b32 s7, v2 ; GPRIDX-NEXT: v_readfirstlane_b32 s8, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s9, v1 ; GPRIDX-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: dyn_insertelement_v5f64_s_v_v: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: v_mov_b32_e32 v18, s15 -; GFX10-NEXT: v_mov_b32_e32 v17, s14 -; GFX10-NEXT: v_mov_b32_e32 v16, s13 -; GFX10-NEXT: v_mov_b32_e32 v15, s12 -; GFX10-NEXT: v_mov_b32_e32 v14, s11 -; GFX10-NEXT: v_mov_b32_e32 v13, s10 +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s8, s10 ; GFX10-NEXT: v_mov_b32_e32 v12, s9 ; GFX10-NEXT: v_mov_b32_e32 v11, s8 ; GFX10-NEXT: v_mov_b32_e32 v10, s7 @@ -5516,19 +6691,16 @@ ; ; GFX11-LABEL: dyn_insertelement_v5f64_s_v_v: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 -; GFX11-NEXT: s_mov_b32 s4, s6 ; GFX11-NEXT: s_mov_b32 s5, s7 -; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 -; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: s_mov_b32 s9, s11 -; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14 -; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12 -; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10 +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s8, s10 ; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8 ; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6 ; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4 @@ -5596,55 +6768,55 @@ ; GFX10-LABEL: dyn_insertelement_v5f64_v_v_s: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 4 ; GFX10-NEXT: v_cndmask_b32_e32 
v0, v0, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 +; GFX10-NEXT: v_readfirstlane_b32 s8, v8 +; GFX10-NEXT: v_readfirstlane_b32 s9, v9 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s4, v4 ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 -; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s6, v6 ; GFX10-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s8, v8 -; GFX10-NEXT: v_readfirstlane_b32 s9, v9 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: dyn_insertelement_v5f64_v_v_s: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s2, 2 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s2, 4 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11 -; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11 -; GFX11-NEXT: v_readfirstlane_b32 s6, v6 -; GFX11-NEXT: v_readfirstlane_b32 s7, v7 -; GFX11-NEXT: v_readfirstlane_b32 s8, v8 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11 ; GFX11-NEXT: v_readfirstlane_b32 s4, v4 ; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s8, v8 +; GFX11-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-NEXT: v_readfirstlane_b32 s7, v7 ; GFX11-NEXT: v_readfirstlane_b32 s9, v9 ; GFX11-NEXT: ; return to shader part epilog entry: @@ -5685,14 +6857,19 @@ ; GFX10-LABEL: dyn_insertelement_v5f64_v_v_v: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v12 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v11, s0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, 
vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 +; GFX10-NEXT: v_readfirstlane_b32 s8, v8 +; GFX10-NEXT: v_readfirstlane_b32 s9, v9 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo @@ -5702,38 +6879,33 @@ ; GFX10-NEXT: v_readfirstlane_b32 s5, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12 ; GFX10-NEXT: v_readfirstlane_b32 s6, v6 ; GFX10-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo -; GFX10-NEXT: v_readfirstlane_b32 s8, v8 -; GFX10-NEXT: v_readfirstlane_b32 s9, v9 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: dyn_insertelement_v5f64_v_v_v: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 4, v12 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v10, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v11, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v11, s1 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 -; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v10 :: v_dual_cndmask_b32 v5, v5, v11 -; GFX11-NEXT: v_readfirstlane_b32 s6, v6 -; GFX11-NEXT: v_readfirstlane_b32 s7, v7 -; GFX11-NEXT: v_readfirstlane_b32 s8, v8 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v10 :: v_dual_cndmask_b32 v7, v7, v11 ; GFX11-NEXT: v_readfirstlane_b32 s4, v4 ; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s8, v8 +; GFX11-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-NEXT: v_readfirstlane_b32 s7, v7 ; GFX11-NEXT: v_readfirstlane_b32 s9, v9 ; GFX11-NEXT: ; return to shader part epilog entry: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir @@ -688,7 +688,7 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF ; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_192 = IMPLICIT_DEF - ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[DEF1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_384 = REG_SEQUENCE [[DEF]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[DEF1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11 ; GCN-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] %0:sgpr(<3 x s64>) = G_IMPLICIT_DEF %1:sgpr(<3 x s64>) = G_IMPLICIT_DEF Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir @@ 
-296,7 +296,7 @@ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_96 = COPY $sgpr3_sgpr4_sgpr5 ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY $sgpr6_sgpr7_sgpr8 ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_96 = COPY $sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_512_with_sub0_sub1_sub2 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2, [[COPY1]], %subreg.sub3_sub4_sub5, [[COPY2]], %subreg.sub6_sub7_sub8, [[COPY3]], %subreg.sub9_sub10_sub11 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_384_with_sub0_sub1_sub2 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2, [[COPY1]], %subreg.sub3_sub4_sub5, [[COPY2]], %subreg.sub6_sub7_sub8, [[COPY3]], %subreg.sub9_sub10_sub11 ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub0_sub1_sub2 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub3_sub4_sub5 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_96 = COPY [[REG_SEQUENCE]].sub6_sub7_sub8 @@ -332,7 +332,7 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_192 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_192 = COPY $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 - ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[COPY1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_384 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1_sub2_sub3_sub4_sub5, [[COPY1]], %subreg.sub6_sub7_sub8_sub9_sub10_sub11 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub0_sub1_sub2 ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub3_sub4_sub5 ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]].sub6_sub7_sub8 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -18,7 +18,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh_intersect_ray: ; GCN: ; %bb.0: -; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] +; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog ; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget @@ -30,7 +30,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh_intersect_ray_flat: ; GCN: ; %bb.0: -; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] +; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0 @@ -78,7 +78,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh64_intersect_ray: ; GCN: ; %bb.0: -; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] +; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: 
; return to shader part epilog %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr) @@ -89,7 +89,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh64_intersect_ray_flat: ; GCN: ; %bb.0: -; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] +; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64 @@ -118,7 +118,7 @@ ; GFX10-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; GFX10-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 ; GFX10-NEXT: v_and_or_b32 v7, v7, 0xffff, v11 -; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 +; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; @@ -159,7 +159,7 @@ ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[15:30], s[4:7] +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[15:25], s[4:7] ; GFX1030-NEXT: ; implicit-def: $vgpr11 ; GFX1030-NEXT: ; implicit-def: $vgpr15 ; GFX1030-NEXT: ; implicit-def: $vgpr16 @@ -182,34 +182,30 @@ ; ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v16, v11 -; GFX1013-NEXT: v_mov_b32_e32 v17, v12 -; GFX1013-NEXT: v_mov_b32_e32 v18, v13 -; GFX1013-NEXT: v_mov_b32_e32 v19, v14 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v17 -; GFX1013-NEXT: v_readfirstlane_b32 s6, v18 -; GFX1013-NEXT: v_readfirstlane_b32 s7, v19 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] -; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19] +; GFX1013-NEXT: v_readfirstlane_b32 s4, v11 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v12 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v13 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v14 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1013-NEXT: image_bvh_intersect_ray v[20:23], v[0:15], s[4:7] -; GFX1013-NEXT: ; implicit-def: $vgpr16 -; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19 +; GFX1013-NEXT: image_bvh_intersect_ray v[15:18], v[0:10], s[4:7] +; GFX1013-NEXT: ; implicit-def: $vgpr11 +; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 +; GFX1013-NEXT: ; implicit-def: $vgpr11_vgpr12_vgpr13_vgpr14 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB6_1 ; GFX1013-NEXT: ; %bb.2: ; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, v20 -; GFX1013-NEXT: v_mov_b32_e32 
v1, v21 -; GFX1013-NEXT: v_mov_b32_e32 v2, v22 -; GFX1013-NEXT: v_mov_b32_e32 v3, v23 +; GFX1013-NEXT: v_mov_b32_e32 v0, v15 +; GFX1013-NEXT: v_mov_b32_e32 v1, v16 +; GFX1013-NEXT: v_mov_b32_e32 v2, v17 +; GFX1013-NEXT: v_mov_b32_e32 v3, v18 ; GFX1013-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: image_bvh_intersect_ray_vgpr_descr: @@ -391,7 +387,7 @@ ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[16:31], s[4:7] +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[16:27], s[4:7] ; GFX1030-NEXT: ; implicit-def: $vgpr12 ; GFX1030-NEXT: ; implicit-def: $vgpr16 ; GFX1030-NEXT: ; implicit-def: $vgpr17 @@ -415,34 +411,30 @@ ; ; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: v_mov_b32_e32 v16, v12 -; GFX1013-NEXT: v_mov_b32_e32 v17, v13 -; GFX1013-NEXT: v_mov_b32_e32 v18, v14 -; GFX1013-NEXT: v_mov_b32_e32 v19, v15 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v17 -; GFX1013-NEXT: v_readfirstlane_b32 s6, v18 -; GFX1013-NEXT: v_readfirstlane_b32 s7, v19 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] -; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19] +; GFX1013-NEXT: v_readfirstlane_b32 s4, v12 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v13 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v14 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v15 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1013-NEXT: image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7] -; GFX1013-NEXT: ; implicit-def: $vgpr16 -; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19 +; GFX1013-NEXT: image_bvh64_intersect_ray v[16:19], v[0:11], s[4:7] +; GFX1013-NEXT: ; implicit-def: $vgpr12 +; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 +; GFX1013-NEXT: ; implicit-def: $vgpr12_vgpr13_vgpr14_vgpr15 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB8_1 ; GFX1013-NEXT: ; %bb.2: ; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, v20 -; GFX1013-NEXT: v_mov_b32_e32 v1, v21 -; GFX1013-NEXT: v_mov_b32_e32 v2, v22 -; GFX1013-NEXT: v_mov_b32_e32 v3, v23 +; GFX1013-NEXT: v_mov_b32_e32 v0, v16 +; GFX1013-NEXT: v_mov_b32_e32 v1, v17 +; GFX1013-NEXT: v_mov_b32_e32 v2, v18 +; GFX1013-NEXT: v_mov_b32_e32 v3, v19 ; GFX1013-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: image_bvh64_intersect_ray_vgpr_descr: @@ -508,7 +500,7 @@ ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[14:29], s[4:7] a16 +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[14:22], s[4:7] a16 ; GFX1030-NEXT: ; implicit-def: $vgpr10 ; GFX1030-NEXT: ; implicit-def: $vgpr14 ; GFX1030-NEXT: ; implicit-def: $vgpr15 @@ -529,42 +521,38 @@ ; ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: 
v_mov_b32_e32 v16, v10 -; GFX1013-NEXT: v_mov_b32_e32 v17, v11 -; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX1013-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GFX1013-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX1013-NEXT: v_and_b32_e32 v15, 0xffff, v8 ; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX1013-NEXT: v_mov_b32_e32 v18, v12 -; GFX1013-NEXT: v_mov_b32_e32 v19, v13 -; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 -; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v11 +; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v14 +; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v15 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 -; GFX1013-NEXT: v_readfirstlane_b32 s5, v17 -; GFX1013-NEXT: v_readfirstlane_b32 s6, v18 -; GFX1013-NEXT: v_readfirstlane_b32 s7, v19 -; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[16:17] -; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[18:19] +; GFX1013-NEXT: v_readfirstlane_b32 s4, v10 +; GFX1013-NEXT: v_readfirstlane_b32 s5, v11 +; GFX1013-NEXT: v_readfirstlane_b32 s6, v12 +; GFX1013-NEXT: v_readfirstlane_b32 s7, v13 +; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] +; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GFX1013-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 -; GFX1013-NEXT: image_bvh64_intersect_ray v[20:23], v[0:15], s[4:7] a16 -; GFX1013-NEXT: ; implicit-def: $vgpr16 -; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX1013-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19 +; GFX1013-NEXT: image_bvh64_intersect_ray v[14:17], v[0:8], s[4:7] a16 +; GFX1013-NEXT: ; implicit-def: $vgpr10 +; GFX1013-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 +; GFX1013-NEXT: ; implicit-def: $vgpr10_vgpr11_vgpr12_vgpr13 ; GFX1013-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX1013-NEXT: s_cbranch_execnz .LBB9_1 ; GFX1013-NEXT: ; %bb.2: ; GFX1013-NEXT: s_mov_b32 exec_lo, s1 ; GFX1013-NEXT: s_waitcnt vmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, v20 -; GFX1013-NEXT: v_mov_b32_e32 v1, v21 -; GFX1013-NEXT: v_mov_b32_e32 v2, v22 -; GFX1013-NEXT: v_mov_b32_e32 v3, v23 +; GFX1013-NEXT: v_mov_b32_e32 v0, v14 +; GFX1013-NEXT: v_mov_b32_e32 v1, v15 +; GFX1013-NEXT: v_mov_b32_e32 v2, v16 +; GFX1013-NEXT: v_mov_b32_e32 v3, v17 ; GFX1013-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: @@ -631,7 +619,7 @@ ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7] +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm @@ -661,7 +649,7 @@ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7] +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; 
GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm @@ -885,7 +873,7 @@ ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7 ; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm @@ -914,7 +902,7 @@ ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 ; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7] +; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm @@ -1012,7 +1000,7 @@ ; GFX1030-NEXT: v_mov_b32_e32 v7, s4 ; GFX1030-NEXT: v_mov_b32_e32 v8, s6 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm @@ -1056,7 +1044,7 @@ ; GFX1013-NEXT: v_mov_b32_e32 v7, s0 ; GFX1013-NEXT: v_mov_b32_e32 v8, s2 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7] a16 +; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir +++ llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir @@ -1,7 +1,7 @@ # RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s # Check that %11 and %20 have been coalesced. -# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG:[0-9]+]] -# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG]] +# CHECK: IMAGE_SAMPLE_C_D_O_V1_V11 %[[REG:[0-9]+]] +# CHECK: IMAGE_SAMPLE_C_D_O_V1_V11 %[[REG]] --- name: main @@ -17,9 +17,9 @@ - { id: 6, class: sgpr_128 } - { id: 7, class: sgpr_512 } - { id: 9, class: vreg_512 } - - { id: 11, class: vreg_512 } + - { id: 11, class: vreg_352 } - { id: 18, class: vgpr_32 } - - { id: 20, class: vreg_512 } + - { id: 20, class: vreg_352 } - { id: 27, class: vgpr_32 } liveins: - { reg: '$sgpr2_sgpr3', virtual-reg: '%0' } @@ -61,7 +61,7 @@ %11.sub6 = COPY %1 %11.sub7 = COPY %1 %11.sub8 = COPY %1 - dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32)) + dead %18 = IMAGE_SAMPLE_C_D_O_V1_V11 %11, %3, %4, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32)) %20.sub1 = COPY %2 %20.sub2 = COPY %2 %20.sub3 = COPY %2 @@ -70,6 +70,6 @@ %20.sub6 = COPY %2 %20.sub7 = COPY %2 %20.sub8 = COPY %2 - dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32)) + dead %27 = IMAGE_SAMPLE_C_D_O_V1_V11 %20, %5, %6, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s32)) ... 
Index: llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -171,14 +171,20 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s26, -1 +; SI-NEXT: s_mov_b32 s27, 0xe8f000 +; SI-NEXT: s_add_u32 s24, s24, s3 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s18, s10 ; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_mov_b32 s22, s10 ; SI-NEXT: s_mov_b32 s23, s11 @@ -197,24 +203,30 @@ ; ; VI-LABEL: test_copy_v4i8_x4: ; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s90, -1 +; VI-NEXT: s_mov_b32 s91, 0xe80000 +; VI-NEXT: s_add_u32 s88, s88, s3 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_addc_u32 s89, s89, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s18, s10 ; VI-NEXT: s_mov_b32 s19, s11 -; VI-NEXT: s_mov_b32 s22, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_mov_b32 s22, s10 ; VI-NEXT: s_mov_b32 s23, s11 ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 Index: llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll +++ llvm/test/CodeGen/AMDGPU/copy-to-reg-scc-clobber.ll @@ -7,21 +7,27 @@ ; RRLIST-LABEL: sccClobber: ; RRLIST: ; %bb.0: ; %entry ; RRLIST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; RRLIST-NEXT: v_mov_b32_e32 v2, 0 +; RRLIST-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; RRLIST-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; RRLIST-NEXT: s_mov_b32 s22, -1 +; RRLIST-NEXT: s_mov_b32 s23, 0xe00000 +; RRLIST-NEXT: s_add_u32 s20, s20, s3 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) ; RRLIST-NEXT: s_load_dword s16, s[8:9], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 ; RRLIST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 ; RRLIST-NEXT: s_load_dword s17, s[10:11], 0x0 +; RRLIST-NEXT: s_addc_u32 s21, s21, 0 ; RRLIST-NEXT: s_waitcnt lgkmcnt(0) -; RRLIST-NEXT: s_min_i32 s4, s16, 0 ; RRLIST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; RRLIST-NEXT: s_min_i32 s4, s16, 0 ; RRLIST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; RRLIST-NEXT: s_and_b64 s[0:1], vcc, exec ; RRLIST-NEXT: s_cselect_b32 s0, s16, s17 ; RRLIST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] ; RRLIST-NEXT: s_cselect_b32 s0, s4, s0 +; RRLIST-NEXT: v_mov_b32_e32 v2, 0 ; RRLIST-NEXT: v_mov_b32_e32 v0, s0 ; RRLIST-NEXT: global_store_dword v2, v0, s[14:15] ; RRLIST-NEXT: s_endpgm @@ -29,21 +35,27 @@ ; 
FAST-LABEL: sccClobber: ; FAST: ; %bb.0: ; %entry ; FAST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; FAST-NEXT: v_mov_b32_e32 v2, 0 +; FAST-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; FAST-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; FAST-NEXT: s_mov_b32 s22, -1 +; FAST-NEXT: s_mov_b32 s23, 0xe00000 +; FAST-NEXT: s_add_u32 s20, s20, s3 ; FAST-NEXT: s_waitcnt lgkmcnt(0) ; FAST-NEXT: s_load_dword s16, s[8:9], 0x0 ; FAST-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; FAST-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 ; FAST-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x44 ; FAST-NEXT: s_load_dword s17, s[10:11], 0x0 +; FAST-NEXT: s_addc_u32 s21, s21, 0 ; FAST-NEXT: s_waitcnt lgkmcnt(0) -; FAST-NEXT: s_min_i32 s4, s16, 0 ; FAST-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; FAST-NEXT: s_min_i32 s4, s16, 0 ; FAST-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; FAST-NEXT: s_and_b64 s[0:1], vcc, exec ; FAST-NEXT: s_cselect_b32 s0, s16, s17 ; FAST-NEXT: s_cmp_eq_u64 s[12:13], s[2:3] ; FAST-NEXT: s_cselect_b32 s0, s4, s0 +; FAST-NEXT: v_mov_b32_e32 v2, 0 ; FAST-NEXT: v_mov_b32_e32 v0, s0 ; FAST-NEXT: global_store_dword v2, v0, s[14:15] ; FAST-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/function-returns.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/function-returns.ll +++ llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -310,7 +310,7 @@ ; GCN-LABEL: {{^}}v5i64_func_void: ; GCN-DAG: buffer_load_dwordx4 v[0:3], off ; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx2 v[8:9], off ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <5 x i64> @v5i64_func_void() #0 { Index: llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -385,22 +385,19 @@ ; GCN: s_load_dword [[ARG:s[0-9]+]] ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000 +; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd ; MOVREL: s_waitcnt ; MOVREL: s_add_i32 m0, [[ARG]], -16 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0 -; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0 ; MOVREL: s_mov_b32 m0, -1 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000 +; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd ; IDXMODE: s_waitcnt ; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16 -; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST) ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0 -; IDXMODE: s_set_gpr_idx_off -; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd -; IDXMODE: s_set_gpr_idx_on [[ARG]], gpr_idx(DST) ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0 ; IDXMODE: s_set_gpr_idx_off Index: llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -610,10 +610,16 @@ define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) { ; GCN-LABEL: double5_inselt: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s18, -1 +; GCN-NEXT: s_mov_b32 s19, 0xe80000 +; GCN-NEXT: s_add_u32 s16, s16, s3 ; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4 ; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84 ; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 ; GCN-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x64 +; GCN-NEXT: s_addc_u32 s17, s17, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s12, 4 ; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9 @@ -622,10 +628,8 @@ ; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3 ; GCN-NEXT: s_cselect_b32 s2, 0, s2 ; GCN-NEXT: s_cmp_eq_u32 s12, 0 -; GCN-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NEXT: s_cselect_b32 s8, 0x3ff00000, s1 -; GCN-NEXT: s_cselect_b32 s9, 0, s0 +; GCN-NEXT: s_cselect_b32 s13, 0x3ff00000, s1 +; GCN-NEXT: s_cselect_b32 s14, 0, s0 ; GCN-NEXT: s_cmp_eq_u32 s12, 3 ; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7 ; GCN-NEXT: s_cselect_b32 s1, 0, s6 @@ -636,23 +640,26 @@ ; GCN-NEXT: s_add_u32 s0, s10, 16 ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: s_addc_u32 s1, s11, 0 -; GCN-NEXT: v_mov_b32_e32 v7, s1 +; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-NEXT: v_mov_b32_e32 v6, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NEXT: s_add_u32 s0, s10, 32 +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: s_add_u32 s0, s10, 32 -; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; GCN-NEXT: v_mov_b32_e32 v5, s11 ; GCN-NEXT: s_addc_u32 s1, s11, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm entry: %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -6,7 +6,7 @@ ; FIXME: For some reason the 8 and 16 vectors are being stored as ; individual elements instead of 128-bit stores. 
-define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_0: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -37,7 +37,7 @@ ret void } -define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_1: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -68,7 +68,7 @@ ret void } -define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { +define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_0: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -99,7 +99,7 @@ ret void } -define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { +define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_1: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -132,7 +132,7 @@ ; FIXME: Why is the constant moved into the intermediate register and ; not just directly into the vector component? -define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_0: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 @@ -167,7 +167,7 @@ ret void } -define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_1: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 @@ -202,7 +202,7 @@ ret void } -define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_2: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 @@ -237,7 +237,7 @@ ret void } -define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x float> %a) nounwind { ; SI-LABEL: insertelement_v4f32_3: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 @@ -272,7 +272,7 @@ ret void } -define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { +define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32> %a) nounwind { ; SI-LABEL: insertelement_v4i32_0: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 @@ -307,7 +307,7 @@ ret void } -define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_1: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 @@ -338,7 +338,7 @@ ret void } -define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 
+define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x float> %a) nounwind { ; SI-LABEL: insertelement_v3f32_2: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 @@ -369,7 +369,7 @@ ret void } -define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v3f32_3(ptr addrspace(1) %out, <3 x float> %a) nounwind { ; GCN-LABEL: insertelement_v3f32_3: ; GCN: ; %bb.0: ; GCN-NEXT: s_endpgm @@ -394,7 +394,107 @@ ret <4 x float> %tmp2 } -define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { +define <9 x float> @insertelement_to_v9f32_undef() nounwind { +; GCN-LABEL: insertelement_to_v9f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0x40a00000 +; GCN-NEXT: v_mov_b32_e32 v2, 0xc0a00000 +; GCN-NEXT: v_mov_b32_e32 v7, 0x41880000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] + %tmp = load <9 x float>, <9 x float> addrspace(4)* undef + %tmp1 = insertelement <9 x float> %tmp, float 5.000, i32 0 + %tmp2 = insertelement <9 x float> %tmp1, float -5.000, i32 2 + %tmp3 = insertelement <9 x float> %tmp2, float 17.000, i32 7 + ret <9 x float> %tmp3 +} + +define <10 x float> @insertelement_to_v10f32_undef() nounwind { +; GCN-LABEL: insertelement_to_v10f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 2.0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, s12 +; GCN-NEXT: v_mov_b32_e32 v9, s13 +; GCN-NEXT: s_setpc_b64 s[30:31] + %tmp = load <10 x float>, <10 x float> addrspace(4)* undef + %tmp1 = insertelement <10 x float> %tmp, float 2.0, i32 0 + ret <10 x float> %tmp1 +} + +define <11 x float> @insertelement_to_v11f32_undef() nounwind { +; GCN-LABEL: insertelement_to_v11f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, s12 +; GCN-NEXT: v_mov_b32_e32 v9, s13 +; GCN-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NEXT: s_setpc_b64 s[30:31] + %tmp = load <11 x float>, <11 x float> addrspace(4)* undef + %tmp1 = insertelement <11 x float> %tmp, float 1.000, i32 0 + ret <11 x float> %tmp1 +} + +define <12 x float> @insertelement_to_v12f32_undef() nounwind { +; GCN-LABEL: 
insertelement_to_v12f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 4.0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, s12 +; GCN-NEXT: v_mov_b32_e32 v9, s13 +; GCN-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NEXT: s_setpc_b64 s[30:31] + %tmp = load <12 x float>, <12 x float> addrspace(4)* undef + %tmp1 = insertelement <12 x float> %tmp, float 4.0, i32 0 + ret <12 x float> %tmp1 +} + +define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -441,7 +541,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s8, s[4:5], 0x8 @@ -494,7 +594,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s8, s[4:5], 0x8 @@ -555,7 +655,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -606,7 +706,249 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind { +; SI-LABEL: dynamic_insertelement_v9f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dword s6, s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: v_mov_b32_e32 v9, 0x40a00000 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_movreld_b32_e32 v0, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: dynamic_insertelement_v9f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: v_mov_b32_e32 v9, 0x40a00000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_movreld_b32_e32 v0, v9 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_endpgm + %vecins = insertelement <9 x float> %a, float 5.000000e+00, i32 %b + store <9 x float> %vecins, <9 x float> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind { +; SI-LABEL: dynamic_insertelement_v10f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: v_mov_b32_e32 v10, 0x40a00000 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s7 +; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_movreld_b32_e32 v0, v10 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32 +; SI-NEXT: s_endpgm +; +; VI-LABEL: dynamic_insertelement_v10f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: v_mov_b32_e32 v10, 0x40a00000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v9, s7 +; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_movreld_b32_e32 v0, v10 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32 +; VI-NEXT: s_endpgm + %vecins = insertelement <10 x float> %a, float 5.000000e+00, i32 %b + store <10 x float> %vecins, <10 x float> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, <11 x float> %a, i32 %b) nounwind { +; SI-LABEL: dynamic_insertelement_v11f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 
0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: v_mov_b32_e32 v11, 0x40a00000 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_movreld_b32_e32 v0, v11 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32 +; SI-NEXT: s_endpgm +; +; VI-LABEL: dynamic_insertelement_v11f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v11, 0x40a00000 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v8, s8 +; VI-NEXT: v_mov_b32_e32 v9, s9 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: v_movreld_b32_e32 v0, v11 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32 +; VI-NEXT: s_endpgm + %vecins = insertelement <11 x float> %a, float 5.000000e+00, i32 %b + store <11 x float> %vecins, <11 x float> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, <12 x float> %a, i32 %b) nounwind { +; SI-LABEL: dynamic_insertelement_v12f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: v_mov_b32_e32 v12, 0x40a00000 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_movreld_b32_e32 v0, v12 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-NEXT: s_endpgm +; +; VI-LABEL: dynamic_insertelement_v12f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: v_mov_b32_e32 v12, 0x40a00000 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) 
+; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v8, s8 +; VI-NEXT: v_mov_b32_e32 v9, s9 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v11, s11 +; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: v_movreld_b32_e32 v0, v12 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; VI-NEXT: s_endpgm + %vecins = insertelement <12 x float> %a, float 5.000000e+00, i32 %b + store <12 x float> %vecins, <12 x float> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, <16 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -677,7 +1019,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -718,7 +1060,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s8, s[4:5], 0x8 @@ -763,7 +1105,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { ; SI-LABEL: dynamic_insertelement_v4i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 @@ -816,7 +1158,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 @@ -865,7 +1207,241 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind { +; SI-LABEL: dynamic_insertelement_v9i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dword s6, s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 
+; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_movreld_b32_e32 v0, 5 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: dynamic_insertelement_v9i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: v_movreld_b32_e32 v0, 5 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_endpgm + %vecins = insertelement <9 x i32> %a, i32 5, i32 %b + store <9 x i32> %vecins, <9 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind { +; SI-LABEL: dynamic_insertelement_v10i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s7 +; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_movreld_b32_e32 v0, 5 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32 +; SI-NEXT: s_endpgm +; +; VI-LABEL: dynamic_insertelement_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s6 +; VI-NEXT: v_mov_b32_e32 v9, s7 +; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_movreld_b32_e32 v0, 5 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32 +; VI-NEXT: s_endpgm + %vecins = insertelement <10 x i32> %a, i32 5, i32 %b + store <10 x i32> %vecins, <10 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_kernel void 
@dynamic_insertelement_v11i32(ptr addrspace(1) %out, <11 x i32> %a, i32 %b) nounwind { +; SI-LABEL: dynamic_insertelement_v11i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_movreld_b32_e32 v0, 5 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32 +; SI-NEXT: s_endpgm +; +; VI-LABEL: dynamic_insertelement_v11i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v8, s8 +; VI-NEXT: v_mov_b32_e32 v9, s9 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: v_movreld_b32_e32 v0, 5 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32 +; VI-NEXT: s_endpgm + %vecins = insertelement <11 x i32> %a, i32 5, i32 %b + store <11 x i32> %vecins, <11 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, <12 x i32> %a, i32 %b) nounwind { +; SI-LABEL: dynamic_insertelement_v12i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18 +; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: v_movreld_b32_e32 v0, 5 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; SI-NEXT: s_endpgm +; +; VI-LABEL: dynamic_insertelement_v12i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60 +; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v8, s8 +; VI-NEXT: v_mov_b32_e32 v9, s9 +; VI-NEXT: v_mov_b32_e32 v10, s10 +; VI-NEXT: v_mov_b32_e32 v11, s11 +; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: v_movreld_b32_e32 v0, 5 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; VI-NEXT: s_endpgm + %vecins = insertelement <12 x i32> %a, i32 5, i32 %b + store <12 x i32> %vecins, <12 x i32> addrspace(1)* %out, align 32 + ret void +} + +define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, <16 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 @@ -934,7 +1510,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -973,7 +1549,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1023,7 +1599,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i8: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s6, s[4:5], 0x13 @@ -1065,7 +1641,7 @@ ; FIXME: post legalize i16 and i32 shifts aren't merged because of ; isTypeDesirableForOp in SimplifyDemandedBits -define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i8: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s6, s[4:5], 0x13 @@ -1110,7 +1686,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4i8: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s6, s[4:5], 0x13 @@ -1149,7 +1725,7 @@ ret void } -define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind { +define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, <8 x i8> 
addrspace(4)* %a.ptr, i32 %b) nounwind { ; SI-LABEL: s_dynamic_insertelement_v8i8: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1201,7 +1777,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <16 x i8> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v16i8: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 @@ -1410,24 +1986,24 @@ ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that ; the compiler doesn't crash. -define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { +define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { ; SI-LABEL: insert_split_bb: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dword s6, s[4:5], 0x4 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s6, 0 -; SI-NEXT: s_cbranch_scc0 .LBB30_4 +; SI-NEXT: s_cbranch_scc0 .LBB42_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_load_dword s7, s[2:3], 0x1 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB30_3 -; SI-NEXT: .LBB30_2: ; %if +; SI-NEXT: s_cbranch_vccnz .LBB42_3 +; SI-NEXT: .LBB42_2: ; %if ; SI-NEXT: s_load_dword s7, s[2:3], 0x0 -; SI-NEXT: .LBB30_3: ; %endif +; SI-NEXT: .LBB42_3: ; %endif ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -1435,8 +2011,8 @@ ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB30_4: -; SI-NEXT: s_branch .LBB30_2 +; SI-NEXT: .LBB42_4: +; SI-NEXT: s_branch .LBB42_2 ; ; VI-LABEL: insert_split_bb: ; VI: ; %bb.0: ; %entry @@ -1444,14 +2020,14 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u32 s6, 0 -; VI-NEXT: s_cbranch_scc0 .LBB30_4 +; VI-NEXT: s_cbranch_scc0 .LBB42_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_load_dword s7, s[2:3], 0x4 -; VI-NEXT: s_cbranch_execnz .LBB30_3 -; VI-NEXT: .LBB30_2: ; %if +; VI-NEXT: s_cbranch_execnz .LBB42_3 +; VI-NEXT: .LBB42_2: ; %if ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s7, s[2:3], 0x0 -; VI-NEXT: .LBB30_3: ; %endif +; VI-NEXT: .LBB42_3: ; %endif ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 @@ -1459,8 +2035,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm -; VI-NEXT: .LBB30_4: -; VI-NEXT: s_branch .LBB30_2 +; VI-NEXT: .LBB42_4: +; VI-NEXT: s_branch .LBB42_2 entry: %0 = insertelement <2 x i32> undef, i32 %a, i32 0 %1 = icmp eq i32 %a, 0 @@ -1483,7 +2059,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s8, s[4:5], 0x18 @@ -1530,7 +2106,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, 
<2 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s8, s[4:5], 0x8 @@ -1577,7 +2153,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 x i64> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s6, s[4:5], 0x10 @@ -1638,7 +2214,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s6, s[4:5], 0x10 @@ -1709,7 +2285,7 @@ ret void } -define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 { +define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 { ; SI-LABEL: dynamic_insertelement_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s6, s[4:5], 0x20 Index: llvm/test/CodeGen/AMDGPU/ipra-regmask.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ipra-regmask.ll +++ llvm/test/CodeGen/AMDGPU/ipra-regmask.ll @@ -1,19 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s ; Make sure the expected regmask is generated for sub/superregisters. -; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}} +; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}} define void @csr() #0 { call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0 ret void } -; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}} +; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}} define void @subregs_for_super() #0 { call void asm sideeffect "", "~{v0},~{v1}"() #0 ret void } -; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}} +; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}} define void @clobbered_reg_with_sub() #0 { call void asm sideeffect "", "~{v[0:1]}"() #0 ret void @@ -44,3 +45,5 @@ i8* bitcast (void ()* @vcc to i8*)] attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} Index: llvm/test/CodeGen/AMDGPU/kernel-args.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -2286,57 +2286,56 @@ ; ; VI-LABEL: v5i64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84 +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_add_u32 s8, s2, 16 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: s_add_u32 s12, s8, 32 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: s_addc_u32 s13, s9, 0 +; VI-NEXT: v_mov_b32_e32 v3, s12 +; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v4, s13 +; VI-NEXT: s_add_u32 s4, s8, 16 +; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_addc_u32 s5, s9, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5i64_arg: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[2:3] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i64_arg: @@ -2429,57 +2428,56 @@ ; ; VI-LABEL: v5f64_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84 +; 
VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: s_add_u32 s8, s2, 16 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: s_add_u32 s12, s8, 32 +; VI-NEXT: v_mov_b32_e32 v1, s10 +; VI-NEXT: s_addc_u32 s13, s9, 0 +; VI-NEXT: v_mov_b32_e32 v3, s12 +; VI-NEXT: v_mov_b32_e32 v2, s11 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v4, s13 +; VI-NEXT: s_add_u32 s4, s8, 16 +; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_addc_u32 s5, s9, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5f64_arg: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[2:3] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5f64_arg: Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll @@ -32,9 +32,9 @@ } ; GCN-LABEL: {{^}}sample_d_3d: -; GFX1010-NSA: image_sample_d v[0:3], v[7:22], +; GFX1010-NSA: image_sample_d v[0:3], v[7:15], ; GFX1030-NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1], -; GFX11-NSA: image_sample_d v[0:3], v[7:22], +; GFX11-NSA: image_sample_d v[0:3], v[7:15], define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float %t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) { main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float 
%drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -1568,19 +1568,19 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) { ; VERDE-LABEL: sample_c_d_o_2darray_V1: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da +; VERDE-NEXT: image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 da ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_d_o_2darray_V1: ; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da +; GFX6789-NEXT: image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 da ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: sample_c_d_o_2darray_V1: ; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10PLUS-NEXT: image_sample_c_d_o v0, v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX10PLUS-NEXT: ; return to shader part epilog main_body: @@ -1593,7 +1593,7 @@ ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: v_mov_b32_e32 v9, 0 ; VERDE-NEXT: v_mov_b32_e32 v10, v9 -; VERDE-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da +; VERDE-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 tfe da ; VERDE-NEXT: s_mov_b32 s15, 0xf000 ; VERDE-NEXT: s_mov_b32 s14, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) @@ -1608,7 +1608,7 @@ ; GFX6789-NEXT: v_mov_b32_e32 v12, v11 ; GFX6789-NEXT: v_mov_b32_e32 v9, v11 ; GFX6789-NEXT: v_mov_b32_e32 v10, v12 -; GFX6789-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da +; GFX6789-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 tfe da ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: v_mov_b32_e32 v0, v9 ; GFX6789-NEXT: global_store_dword v11, v10, s[12:13] @@ -1621,7 +1621,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; GFX10-NEXT: v_mov_b32_e32 v9, v11 ; GFX10-NEXT: v_mov_b32_e32 v10, v12 -; GFX10-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe +; GFX10-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; GFX10-NEXT: global_store_dword v11, v10, s[12:13] @@ -1633,7 +1633,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v11, 0 ; GFX11-NEXT: v_mov_b32_e32 v12, v11 ; GFX11-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12 -; GFX11-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe +; GFX11-NEXT: image_sample_c_d_o v[9:10], v[0:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v9 ; GFX11-NEXT: global_store_b32 v11, v10, s[12:13] @@ -1650,19 +1650,19 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, 
float %dtdv, float %s, float %t, float %slice) { ; VERDE-LABEL: sample_c_d_o_2darray_V2: ; VERDE: ; %bb.0: ; %main_body -; VERDE-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da +; VERDE-NEXT: image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 da ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_d_o_2darray_V2: ; GFX6789: ; %bb.0: ; %main_body -; GFX6789-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da +; GFX6789-NEXT: image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 da ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: sample_c_d_o_2darray_V2: ; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10PLUS-NEXT: image_sample_c_d_o v[0:1], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX10PLUS-NEXT: ; return to shader part epilog main_body: @@ -1676,7 +1676,7 @@ ; VERDE-NEXT: v_mov_b32_e32 v9, 0 ; VERDE-NEXT: v_mov_b32_e32 v10, v9 ; VERDE-NEXT: v_mov_b32_e32 v11, v9 -; VERDE-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da +; VERDE-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 tfe da ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: v_mov_b32_e32 v0, v9 ; VERDE-NEXT: v_mov_b32_e32 v1, v10 @@ -1688,7 +1688,7 @@ ; GFX6789-NEXT: v_mov_b32_e32 v9, 0 ; GFX6789-NEXT: v_mov_b32_e32 v10, v9 ; GFX6789-NEXT: v_mov_b32_e32 v11, v9 -; GFX6789-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da +; GFX6789-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 tfe da ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: v_mov_b32_e32 v0, v9 ; GFX6789-NEXT: v_mov_b32_e32 v1, v10 @@ -1700,7 +1700,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, v9 ; GFX10-NEXT: v_mov_b32_e32 v11, v9 -; GFX10-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe +; GFX10-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; GFX10-NEXT: v_mov_b32_e32 v1, v10 @@ -1712,7 +1712,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v9, 0 ; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe +; GFX11-NEXT: image_sample_c_d_o v[9:11], v[0:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, v11 ; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll @@ -186,7 +186,7 @@ } ; GCN-LABEL: {{^}}sample_c_d_cl_o_2d: -; GCN: image_sample_c_d_cl_o v[0:3], v[0:15], s[0:7], s[8:11] dmask:0xf{{$}} +; GCN: image_sample_c_d_cl_o v[0:3], v[0:8], s[0:7], s[8:11] dmask:0xf{{$}} define amdgpu_ps <4 x float> @sample_c_d_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { main_body: %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.d.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -250,7 +250,7 @@ } ; GCN-LABEL: {{^}}sample_c_cd_cl_o_2d: -; GCN: image_sample_c_cd_cl_o v[0:3], v[0:15], s[0:7], s[8:11] dmask:0xf{{$}} +; GCN: image_sample_c_cd_cl_o v[0:3], v[0:8], s[0:7], s[8:11] dmask:0xf{{$}} define amdgpu_ps <4 x float> @sample_c_cd_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -20,7 +20,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh_intersect_ray: ; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] +; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -90,7 +90,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh64_intersect_ray: ; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] +; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -128,7 +128,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v8, s8 ; GFX10-NEXT: s_mov_b32 s15, s13 ; GFX10-NEXT: s_mov_b32 s13, s11 -; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16 +; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[12:15] a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; @@ -182,7 +182,7 @@ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7] +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm @@ -208,7 +208,7 @@ ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7] +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm @@ -370,7 +370,7 @@ ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7 ; 
GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] +; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm @@ -396,7 +396,7 @@ ; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm @@ -461,7 +461,7 @@ ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 +; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm @@ -484,7 +484,7 @@ ; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/load-constant-i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -9,7 +9,7 @@ ; GCN: s_load_dword s{{[0-9]+}} ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { entry: %ld = load i32, i32 addrspace(4)* %in store i32 %ld, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 -define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { entry: %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in store <2 x i32> %ld, <2 x i32> addrspace(1)* %out @@ -31,7 +31,7 @@ ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(4)* %in store <3 x i32> %ld, <3 x i32> addrspace(1)* %out @@ -42,7 +42,7 @@ ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in store <4 x i32> %ld, <4 x i32> addrspace(1)* %out @@ -54,13 +54,69 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { entry: %ld = load <8 x i32>, <8 x 
i32> addrspace(4)* %in store <8 x i32> %ld, <8 x i32> addrspace(1)* %out ret void } +; FUNC-LABEL: {{^}}constant_load_v9i32: +; GCN: s_load_dword +; GCN: s_load_dwordx8 + +; EG: VTX_READ_128 +; EG: VTX_READ_128 +; EG: VTX_READ_32 +define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { +entry: + %ld = load <9 x i32>, <9 x i32> addrspace(4)* %in + store <9 x i32> %ld, <9 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}constant_load_v10i32: +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx8 + +; EG: VTX_READ_128 +; EG: VTX_READ_128 +; EG: VTX_READ_128 +define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { +entry: + %ld = load <10 x i32>, <10 x i32> addrspace(4)* %in + store <10 x i32> %ld, <10 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}constant_load_v11i32: +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx8 + +; EG: VTX_READ_128 +; EG: VTX_READ_128 +; EG: VTX_READ_128 +define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { +entry: + %ld = load <11 x i32>, <11 x i32> addrspace(4)* %in + store <11 x i32> %ld, <11 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}constant_load_v12i32: +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx8 + +; EG: VTX_READ_128 +; EG: VTX_READ_128 +; EG: VTX_READ_128 +define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { +entry: + %ld = load <12 x i32>, <12 x i32> addrspace(4)* %in + store <12 x i32> %ld, <12 x i32> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}constant_load_v16i32: ; GCN: s_load_dwordx16 @@ -68,7 +124,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in store <16 x i32> %ld, <16 x i32> addrspace(1)* %out @@ -83,7 +139,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY ; EG: CF_END ; EG: VTX_READ_32 -define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load i32, i32 addrspace(4)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -100,7 +156,7 @@ ; EG: VTX_READ_32 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal. 
; EG: 31 -define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load i32, i32 addrspace(4)* %in %ext = sext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -110,7 +166,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64: ; GCN: s_load_dword ; GCN: store_dwordx2 -define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(4)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -121,7 +177,7 @@ ; GCN: s_load_dword s[[LO:[0-9]+]] ; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31 ; GCN: store_dwordx2 -define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(4)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -131,7 +187,7 @@ ; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64: ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: store_dwordx4 -define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -145,7 +201,7 @@ ; GCN-DAG: s_ashr_i32 ; GCN: store_dwordx4 -define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -157,7 +213,7 @@ ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -174,7 +230,7 @@ ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(4)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -193,7 +249,7 @@ ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-SA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr 
addrspace(4) %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -221,7 +277,7 @@ ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(4)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -242,7 +298,7 @@ ; GCN: store_dwordx4 ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -269,7 +325,7 @@ ; GCN-HSA: {{flat|global}}_store_dwordx4 ; GCN-HSA: {{flat|global}}_store_dwordx4 ; GCN-HSA: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(4)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -321,7 +377,7 @@ ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -372,7 +428,7 @@ ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -424,7 +480,7 @@ ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @constant_load_v32i32(<32 x i32> addrspace(1)* %out, <32 x i32> addrspace(4)* %in) #0 { +define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(4)* %in store <32 x i32> %ld, <32 x i32> addrspace(1)* %out ret void Index: llvm/test/CodeGen/AMDGPU/load-global-f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/load-global-f32.ll +++ llvm/test/CodeGen/AMDGPU/load-global-f32.ll @@ -10,7 +10,7 @@ ; GCN-HSA: flat_load_dword ; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define amdgpu_kernel 
void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %tmp0 = load float, float addrspace(1)* %in store float %tmp0, float addrspace(1)* %out @@ -22,7 +22,7 @@ ; GCN-HSA: flat_load_dwordx2 ; R600: VTX_READ_64 -define amdgpu_kernel void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in store <2 x float> %tmp0, <2 x float> addrspace(1)* %out @@ -35,7 +35,7 @@ ; GCNX3-HSA: flat_load_dwordx3 ; R600: VTX_READ_128 -define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in store <3 x float> %tmp0, <3 x float> addrspace(1)* %out @@ -47,7 +47,7 @@ ; GCN-HSA: flat_load_dwordx4 ; R600: VTX_READ_128 -define amdgpu_kernel void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in store <4 x float> %tmp0, <4 x float> addrspace(1)* %out @@ -62,13 +62,89 @@ ; R600: VTX_READ_128 ; R600: VTX_READ_128 -define amdgpu_kernel void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in store <8 x float> %tmp0, <8 x float> addrspace(1)* %out ret void } +; FUNC-LABEL: {{^}}global_load_v9f32: +; GCN-NOHSA: buffer_load_dword +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dword +; GCN-HSA: flat_load_dwordx4 + +; R600: VTX_READ_128 +; R600: VTX_READ_32 +; R600: VTX_READ_128 +define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +entry: + %tmp0 = load <9 x float>, <9 x float> addrspace(1)* %in + store <9 x float> %tmp0, <9 x float> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}global_load_v10f32: +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx2 + +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; R600: VTX_READ_128 +define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +entry: + %tmp0 = load <10 x float>, <10 x float> addrspace(1)* %in + store <10 x float> %tmp0, <10 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_load_v11f32: +; SI-NOHSA: buffer_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; GCNX3-NOHSA: buffer_load_dwordx4 +; GCNX3-NOHSA: buffer_load_dwordx4 +; GCNX3-NOHSA: buffer_load_dwordx3 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx3 + +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; R600: VTX_READ_128 +define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +entry: + %tmp0 = load <11 x float>, <11 x float> addrspace(1)* %in + store <11 x float> %tmp0, <11 x float> 
addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_load_v12f32: +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 + +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; R600: VTX_READ_128 +define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +entry: + %tmp0 = load <12 x float>, <12 x float> addrspace(1)* %in + store <12 x float> %tmp0, <12 x float> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}global_load_v16f32: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-NOHSA: buffer_load_dwordx4 @@ -84,7 +160,7 @@ ; R600: VTX_READ_128 ; R600: VTX_READ_128 ; R600: VTX_READ_128 -define amdgpu_kernel void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in store <16 x float> %tmp0, <16 x float> addrspace(1)* %out Index: llvm/test/CodeGen/AMDGPU/load-global-i32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -10,7 +10,7 @@ ; GCN-HSA: {{flat|global}}_load_dword ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define amdgpu_kernel void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %ld = load i32, i32 addrspace(1)* %in store i32 %ld, i32 addrspace(1)* %out @@ -22,7 +22,7 @@ ; GCN-HSA: {{flat|global}}_load_dwordx2 ; EG: VTX_READ_64 -define amdgpu_kernel void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in store <2 x i32> %ld, <2 x i32> addrspace(1)* %out @@ -35,7 +35,7 @@ ; GCNX3-HSA: {{flat|global}}_load_dwordx3 ; EG: VTX_READ_128 -define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in store <3 x i32> %ld, <3 x i32> addrspace(1)* %out @@ -47,7 +47,7 @@ ; GCN-HSA: {{flat|global}}_load_dwordx4 ; EG: VTX_READ_128 -define amdgpu_kernel void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in store <4 x i32> %ld, <4 x i32> addrspace(1)* %out @@ -62,13 +62,73 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define amdgpu_kernel void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in store <8 x i32> %ld, <8 x i32> addrspace(1)* %out ret void } +; FUNC-LABEL: {{^}}global_load_v9i32: +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dword +; GCN-HSA: {{flat|global}}_load_dwordx4 +; GCN-HSA: {{flat|global}}_load_dwordx4 +; GCN-HSA: {{flat|global}}_load_dword +define amdgpu_kernel void 
@global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +entry: + %ld = load <9 x i32>, <9 x i32> addrspace(1)* %in + store <9 x i32> %ld, <9 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_load_v10i32: +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx2 +; GCN-HSA: {{flat|global}}_load_dwordx4 +; GCN-HSA: {{flat|global}}_load_dwordx4 +; GCN-HSA: {{flat|global}}_load_dwordx2 +define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +entry: + %ld = load <10 x i32>, <10 x i32> addrspace(1)* %in + store <10 x i32> %ld, <10 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_load_v11i32: +; SI-NOHSA: buffer_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; GCNX3-NOHSA: buffer_load_dwordx4 +; GCNX3-NOHSA: buffer_load_dwordx4 +; GCNX3-NOHSA: buffer_load_dwordx3 +; GCN-HSA: {{flat|global}}_load_dwordx4 +; GCN-HSA: {{flat|global}}_load_dwordx4 +; GCN-HSA: {{flat|global}}_load_dwordx3 +define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +entry: + %ld = load <11 x i32>, <11 x i32> addrspace(1)* %in + store <11 x i32> %ld, <11 x i32> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}global_load_v12i32: +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-HSA: {{flat|global}}_load_dwordx4 +; GCN-HSA: {{flat|global}}_load_dwordx4 +; GCN-HSA: {{flat|global}}_load_dwordx4 +define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +entry: + %ld = load <12 x i32>, <12 x i32> addrspace(1)* %in + store <12 x i32> %ld, <12 x i32> addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}global_load_v16i32: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-NOHSA: buffer_load_dwordx4 @@ -84,7 +144,7 @@ ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define amdgpu_kernel void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in store <16 x i32> %ld, <16 x i32> addrspace(1)* %out @@ -100,7 +160,7 @@ ; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY -define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load i32, i32 addrspace(1)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -119,7 +179,7 @@ ; EG: VTX_READ_32 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal. 
; EG: 31 -define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load i32, i32 addrspace(1)* %in %ext = sext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -132,7 +192,7 @@ ; GCN-HSA: {{flat|global}}_load_dword ; GCN-HSA: {{flat|global}}_store_dwordx2 -define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -145,7 +205,7 @@ ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-NOHSA: buffer_store_dwordx2 v[[[LO]]:[[HI]]] ; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]] -define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -158,7 +218,7 @@ ; GCN-HSA: {{flat|global}}_load_dwordx2 ; GCN-HSA: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -174,7 +234,7 @@ ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -189,7 +249,7 @@ ; GCN-HSA: {{flat|global}}_load_dwordx4 ; GCN-HSA: {{flat|global}}_store_dwordx4 ; GCN-HSA: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -210,7 +270,7 @@ ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -233,7 +293,7 @@ ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x 
i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -265,7 +325,7 @@ ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -311,7 +371,7 @@ ; GCN-DAG: v_ashrrev_i32 ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -346,7 +406,7 @@ ; GCN-HSA: {{flat|global}}_store_dwordx4 ; GCN-HSA: {{flat|global}}_store_dwordx4 ; GCN-HSA: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -446,7 +506,7 @@ ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -513,7 +573,7 @@ ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -581,7 +641,7 @@ ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 ; GCN-HSA-DAG: {{flat|global}}_store_dwordx4 -define amdgpu_kernel void @global_load_v32i32(<32 x i32> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in store <32 x i32> %ld, <32 x i32> addrspace(1)* %out ret void Index: llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir +++ 
llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx11.mir @@ -714,7 +714,7 @@ # GFX11-LABEL: name: image_sample_c_d_cl_o_merged_v1v3 -# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V16_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) +# GFX11: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V9_gfx11 %5, %3, %2, 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) # GFX11: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX11: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -726,9 +726,9 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_512 = IMPLICIT_DEF - %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V16_gfx11 %5:vreg_512, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V16_gfx11 %5:vreg_512, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_288 = IMPLICIT_DEF + %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V9_gfx11 %5:vreg_288, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V9_gfx11 %5:vreg_288, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- Index: llvm/test/CodeGen/AMDGPU/select.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/select.f16.ll +++ llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -5,14 +5,19 @@ define amdgpu_kernel void @select_f16( ; SI-LABEL: select_f16: ; SI: ; %bb.0: ; %entry +; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s26, -1 +; SI-NEXT: s_mov_b32 s27, 0xe8f000 +; SI-NEXT: s_add_u32 s24, s24, s3 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 @@ -34,6 +39,7 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 @@ -46,14 +52,19 @@ ; ; VI-LABEL: select_f16: ; VI: ; %bb.0: ; %entry +; VI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s26, -1 +; VI-NEXT: s_mov_b32 s27, 0xe80000 +; VI-NEXT: s_add_u32 s24, s24, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 -; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s6 ; VI-NEXT: s_mov_b32 s17, s7 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s8 ; VI-NEXT: s_mov_b32 s21, s9 @@ -75,6 +86,7 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_addc_u32 s25, s25, 0 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, 
v3, v2, vcc ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -420,14 +432,19 @@ define amdgpu_kernel void @select_v2f16( ; SI-LABEL: select_v2f16: ; SI: ; %bb.0: ; %entry +; SI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s26, -1 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s27, 0xe8f000 +; SI-NEXT: s_add_u32 s24, s24, s3 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s6 ; SI-NEXT: s_mov_b32 s17, s7 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s20, s8 ; SI-NEXT: s_mov_b32 s21, s9 @@ -445,6 +462,7 @@ ; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -474,8 +492,13 @@ ; ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry +; VI-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_mov_b32 s26, -1 +; VI-NEXT: s_mov_b32 s27, 0xe80000 +; VI-NEXT: s_add_u32 s24, s24, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s14, s2 @@ -499,6 +522,7 @@ ; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_addc_u32 s25, s25, 0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) Index: llvm/test/CodeGen/AMDGPU/v_madak_f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -67,14 +67,19 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-LABEL: madak_f16_use_2: ; SI: ; %bb.0: ; %entry +; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_mov_b32 s23, 0xe8f000 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_add_u32 s20, s20, s3 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s8 ; SI-NEXT: s_mov_b32 s17, s9 +; SI-NEXT: s_mov_b32 s18, s2 ; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s11 @@ -91,6 +96,7 @@ ; SI-NEXT: v_mov_b32_e32 v3, 0x41200000 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -106,14 +112,19 @@ ; ; VI-LABEL: madak_f16_use_2: ; VI: ; %bb.0: ; %entry +; VI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s22, -1 +; VI-NEXT: s_mov_b32 s23, 0xe80000 ; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 +; VI-NEXT: s_add_u32 s20, s20, s3 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s16, s8 ; VI-NEXT: s_mov_b32 s17, s9 +; VI-NEXT: s_mov_b32 s18, s2 ; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s8, s10 ; VI-NEXT: s_mov_b32 s9, 
s11 @@ -130,6 +141,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, 0x4900 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_addc_u32 s21, s21, 0 ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900 Index: llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir +++ llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir @@ -8,11 +8,11 @@ ; GCN-LABEL: name: waitcnt-check-inorder ; GCN: S_WAITCNT 0 ; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 - ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") - ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") ; GCN-NEXT: S_ENDPGM 0 - $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") - $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") S_ENDPGM 0 ... 
--- @@ -22,11 +22,11 @@ ; GCN-LABEL: name: waitcnt-check-vs-vmem ; GCN: S_WAITCNT 0 ; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 - ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") ; GCN-NEXT: S_WAITCNT 16240 ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -37,11 +37,11 @@ ; GCN-LABEL: name: waitcnt-check-vs-mimg-samp ; GCN: S_WAITCNT 0 ; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 - ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") ; GCN-NEXT: S_WAITCNT 16240 ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) ; GCN-NEXT: S_ENDPGM 0 - $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) S_ENDPGM 0 ... 
@@ -54,10 +54,10 @@ ; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec ; GCN-NEXT: S_WAITCNT 16240 - ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") ; GCN-NEXT: S_ENDPGM 0 $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec - $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") S_ENDPGM 0 ... --- @@ -69,9 +69,9 @@ ; GCN-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr16_vgpr17, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) ; GCN-NEXT: S_WAITCNT 16240 - ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") ; GCN-NEXT: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr16_vgpr17, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) - $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") + $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec :: (dereferenceable load (s128) from custom "ImageResource") S_ENDPGM 0 ... 
Index: llvm/test/MC/AMDGPU/gfx1013.s =================================================================== --- llvm/test/MC/AMDGPU/gfx1013.s +++ llvm/test/MC/AMDGPU/gfx1013.s @@ -1,28 +1,28 @@ // RUN: llvm-mc -arch=amdgcn -mcpu=gfx1013 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck %s -image_bvh64_intersect_ray v[5:8], v[1:16], s[8:11] +image_bvh64_intersect_ray v[5:8], v[1:12], s[8:11] // CHECK: [0x01,0x9f,0x9c,0xf1,0x01,0x05,0x02,0x00] -image_bvh64_intersect_ray v[5:8], v[240:255], s[8:11] a16 -// CHECK: [0x01,0x9f,0x9c,0xf1,0xf0,0x05,0x02,0x40] +image_bvh64_intersect_ray v[5:8], v[247:255], s[8:11] a16 +// CHECK: [0x01,0x9f,0x9c,0xf1,0xf7,0x05,0x02,0x40] -image_bvh64_intersect_ray v[5:8], v[1:16], ttmp[12:15] +image_bvh64_intersect_ray v[5:8], v[1:12], ttmp[12:15] // CHECK: [0x01,0x9f,0x9c,0xf1,0x01,0x05,0x1e,0x00] image_bvh64_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40, v42], s[12:15] -// CHECK: encoding: [0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00] +// CHECK: [0x07,0x9f,0x9c,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x2a,0x00] -image_bvh_intersect_ray v[252:255], v[1:16], s[8:11] +image_bvh_intersect_ray v[252:255], v[1:11], s[8:11] // CHECK: [0x01,0x9f,0x98,0xf1,0x01,0xfc,0x02,0x00] image_bvh_intersect_ray v[5:8], v[248:255], s[8:11] a16 // CHECK: [0x01,0x9f,0x98,0xf1,0xf8,0x05,0x02,0x40] -image_bvh_intersect_ray v[5:8], v[1:16], ttmp[12:15] +image_bvh_intersect_ray v[5:8], v[1:11], ttmp[12:15] // CHECK: [0x01,0x9f,0x98,0xf1,0x01,0x05,0x1e,0x00] image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15] -// CHECK: encoding: [0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00] +// CHECK: [0x07,0x9f,0x98,0xf1,0x32,0x27,0x03,0x00,0x2e,0x17,0x11,0x10,0x0f,0x15,0x14,0x13,0x25,0x28,0x00,0x00] image_msaa_load v[5:6], v[1:4], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY // CHECK: [0x39,0x03,0x00,0xf0,0x01,0x05,0x02,0x00] Index: llvm/test/MC/AMDGPU/gfx1030_new.s =================================================================== --- llvm/test/MC/AMDGPU/gfx1030_new.s +++ llvm/test/MC/AMDGPU/gfx1030_new.s @@ -84,16 +84,16 @@ v_fmac_legacy_f32 v0, s1, 2.0 // GFX10: encoding: [0x00,0x00,0x06,0xd5,0x01,0xe8,0x01,0x00] -image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] +image_bvh_intersect_ray v[4:7], v[9:19], s[4:7] // GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x00] image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 // GFX10: encoding: [0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x40] -image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] +image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7] // GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x00] -image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 +image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16 // GFX10: encoding: [0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x40] image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15] Index: llvm/test/MC/AMDGPU/gfx10_asm_mimg.s =================================================================== --- llvm/test/MC/AMDGPU/gfx10_asm_mimg.s +++ llvm/test/MC/AMDGPU/gfx10_asm_mimg.s @@ -298,8 +298,8 @@ image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v0, v20, v21], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; GFX10: image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v0, v20, v21], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; 
encoding: [0x14,0x07,0x88,0xf0,0x20,0x40,0x21,0x03,0x10,0x08,0x04,0x02,0x01,0x00,0x14,0x15] -image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D -; GFX10: image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x07,0x88,0xf0,0x20,0x40,0x21,0x03] +image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D +; GFX10: image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x07,0x88,0xf0,0x20,0x40,0x21,0x03] image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v5], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_CUBE ; GFX10: image_sample_d v[64:66], [v32, v16, v8, v4, v2, v1, v5], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_CUBE ; encoding: [0x1c,0x07,0x88,0xf0,0x20,0x40,0x21,0x03,0x10,0x08,0x04,0x02,0x01,0x05,0x00,0x00] Index: llvm/test/MC/AMDGPU/gfx10_unsupported.s =================================================================== --- llvm/test/MC/AMDGPU/gfx10_unsupported.s +++ llvm/test/MC/AMDGPU/gfx10_unsupported.s @@ -761,10 +761,10 @@ global_store_dword_addtid v1, off offset:16 glc slc dlc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU -image_bvh64_intersect_ray v[252:255], v[240:255], ttmp[12:15] a16 +image_bvh64_intersect_ray v[252:255], v[247:255], ttmp[12:15] a16 // GFX1010: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU -image_bvh_intersect_ray v[252:255], v[1:16], s[8:11] +image_bvh_intersect_ray v[252:255], v[1:11], s[8:11] // GFX1010: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU image_msaa_load v14, [v204,v11,v14,v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY Index: llvm/test/MC/AMDGPU/gfx11_asm_mimg.s =================================================================== --- llvm/test/MC/AMDGPU/gfx11_asm_mimg.s +++ llvm/test/MC/AMDGPU/gfx11_asm_mimg.s @@ -1248,23 +1248,23 @@ image_atomic_xor v[254:255], v[254:255], ttmp[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc slc dlc a16 lwe // GFX11: [0x98,0x73,0x51,0xf0,0xfe,0xfe,0x5d,0x00] -image_bvh64_intersect_ray v[5:8], v[1:16], s[8:11] +image_bvh64_intersect_ray v[5:8], v[1:12], s[8:11] // GFX11: [0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00] -image_bvh64_intersect_ray v[5:8], v[240:255], s[8:11] -// GFX11: [0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00] +image_bvh64_intersect_ray v[5:8], v[244:255], s[8:11] +// GFX11: [0x80,0x8f,0x68,0xf0,0xf4,0x05,0x02,0x00] -image_bvh64_intersect_ray v[5:8], v[1:16], s[100:103] a16 +image_bvh64_intersect_ray v[5:8], v[1:9], s[100:103] a16 // GFX11: [0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00] -image_bvh64_intersect_ray v[252:255], v[240:255], ttmp[12:15] a16 -// GFX11: [0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00] +image_bvh64_intersect_ray v[252:255], v[247:255], ttmp[12:15] a16 +// GFX11: [0x80,0x8f,0x69,0xf0,0xf7,0xfc,0x1e,0x00] -image_bvh_intersect_ray v[5:8], v[1:16], s[8:11] +image_bvh_intersect_ray v[5:8], v[1:11], s[8:11] // GFX11: [0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00] -image_bvh_intersect_ray v[5:8], v[240:255], s[8:11] -// GFX11: [0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00] +image_bvh_intersect_ray v[5:8], v[245:255], s[8:11] +// GFX11: [0x80,0x8f,0x64,0xf0,0xf5,0x05,0x02,0x00] image_bvh_intersect_ray v[5:8], v[1:8], s[100:103] a16 // GFX11: [0x80,0x8f,0x65,0xf0,0x01,0x05,0x19,0x00] @@ -3264,17 +3264,17 @@ image_sample_c_d v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0x85,0xf0,0xfc,0x05,0x02,0x0c] -image_sample_c_d 
v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_c_d v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c] -image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_c_d v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c] -image_sample_c_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +image_sample_c_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c] -image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 -// GFX11: [0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c] +image_sample_c_d v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +// GFX11: [0x08,0x03,0x85,0xf0,0xf1,0x05,0x02,0x0c] image_sample_c_d v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D // GFX11: [0x04,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c] @@ -3336,17 +3336,17 @@ image_sample_c_d_cl v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0x11,0xf1,0xfc,0x05,0x02,0x0c] -image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_c_d_cl v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c] -image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D -// GFX11: [0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c] +image_sample_c_d_cl v[5:6], v[241:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +// GFX11: [0x08,0x03,0x10,0xf1,0xf1,0x05,0x02,0x0c] -image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +image_sample_c_d_cl v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c] -image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 -// GFX11: [0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c] +image_sample_c_d_cl v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +// GFX11: [0x08,0x03,0x11,0xf1,0xf1,0x05,0x02,0x0c] image_sample_c_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D // GFX11: [0x04,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c] @@ -3360,11 +3360,11 @@ image_sample_c_d_cl v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe // GFX11: [0x04,0x03,0x11,0xf1,0xf9,0xfd,0x22,0x0c] -image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 +image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 // GFX11: [0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c] -image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 -// GFX11: [0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c] +image_sample_c_d_cl v255, v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 +// GFX11: [0x0c,0x03,0x12,0xf1,0xf1,0xff,0x02,0x0c] image_sample_c_d_cl v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 // GFX11: [0x0c,0x03,0x13,0xf1,0x01,0x05,0x22,0x0c] @@ -3384,11 +3384,11 @@ image_sample_c_d_cl v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe // GFX11: [0x10,0x04,0x11,0xf1,0xfb,0xfe,0x22,0x0c] -image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 +image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] 
dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 // GFX11: [0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c] -image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 -// GFX11: [0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c] +image_sample_c_d_cl v255, v[241:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 +// GFX11: [0x14,0x04,0x12,0xf1,0xf1,0xff,0x02,0x0c] image_sample_c_d_cl v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 // GFX11: [0x14,0x04,0x13,0xf1,0x01,0x05,0x38,0x64] @@ -3408,11 +3408,11 @@ image_sample_c_d_cl_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0x51,0xf1,0xfc,0x05,0x02,0x0c] -image_sample_c_d_cl_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_c_d_cl_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c] -image_sample_c_d_cl_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D -// GFX11: [0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c] +image_sample_c_d_cl_g16 v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +// GFX11: [0x08,0x03,0x50,0xf1,0xf1,0x05,0x02,0x0c] image_sample_c_d_cl_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0x51,0xf1,0x01,0x05,0x02,0x0c] @@ -3480,23 +3480,23 @@ image_sample_c_d_cl_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0x29,0xf1,0xfb,0x05,0x02,0x0c] -image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_c_d_cl_o v[5:6], v[1:12], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c] -image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_c_d_cl_o v[5:6], v[240:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c] -image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +image_sample_c_d_cl_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c] -image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +image_sample_c_d_cl_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c] -image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D +image_sample_c_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D // GFX11: [0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c] -image_sample_c_d_cl_o v[254:255], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D -// GFX11: [0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c] +image_sample_c_d_cl_o v[254:255], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D +// GFX11: [0x04,0x03,0x28,0xf1,0xf1,0xfe,0x02,0x0c] image_sample_c_d_cl_o v[5:7], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe // GFX11: [0x04,0x03,0x29,0xf1,0x01,0x05,0x22,0x0c] @@ -3504,10 +3504,10 @@ image_sample_c_d_cl_o v[253:255], v[248:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe // GFX11: [0x04,0x03,0x29,0xf1,0xf8,0xfd,0x22,0x0c] -image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 +image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 // GFX11: [0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c] 
-image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 +image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 // GFX11: [0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c] image_sample_c_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 @@ -3528,10 +3528,10 @@ image_sample_c_d_cl_o v[254:255], v[250:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe // GFX11: [0x10,0x04,0x29,0xf1,0xfa,0xfe,0x22,0x0c] -image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 +image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 // GFX11: [0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c] -image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 +image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 // GFX11: [0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c] image_sample_c_d_cl_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 @@ -3552,10 +3552,10 @@ image_sample_c_d_cl_o_g16 v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0x59,0xf1,0xfb,0x05,0x02,0x0c] -image_sample_c_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_c_d_cl_o_g16 v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c] -image_sample_c_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_c_d_cl_o_g16 v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c] image_sample_c_d_cl_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 @@ -3696,16 +3696,16 @@ image_sample_c_d_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0xad,0xf0,0xfb,0x05,0x02,0x0c] -image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_c_d_o v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c] -image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D -// GFX11: [0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c] +image_sample_c_d_o v[5:6], v[241:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +// GFX11: [0x08,0x03,0xac,0xf0,0xf1,0x05,0x02,0x0c] -image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +image_sample_c_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c] -image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +image_sample_c_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c] image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D @@ -3720,11 +3720,11 @@ image_sample_c_d_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe // GFX11: [0x04,0x03,0xad,0xf0,0xf9,0xfd,0x22,0x0c] -image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 +image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 // GFX11: [0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c] -image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 
-// GFX11: [0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c] +image_sample_c_d_o v255, v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 +// GFX11: [0x0c,0x03,0xae,0xf0,0xf1,0xff,0x02,0x0c] image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 // GFX11: [0x0c,0x03,0xaf,0xf0,0x01,0x05,0x22,0x0c] @@ -3744,11 +3744,11 @@ image_sample_c_d_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe // GFX11: [0x10,0x04,0xad,0xf0,0xfb,0xfe,0x22,0x0c] -image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 +image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 // GFX11: [0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c] -image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 -// GFX11: [0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c] +image_sample_c_d_o v255, v[241:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 +// GFX11: [0x14,0x04,0xae,0xf0,0xf1,0xff,0x02,0x0c] image_sample_c_d_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 // GFX11: [0x14,0x04,0xaf,0xf0,0x01,0x05,0x38,0x64] @@ -3768,11 +3768,11 @@ image_sample_c_d_o_g16 v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0xf1,0xf0,0xfb,0x05,0x02,0x0c] -image_sample_c_d_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_c_d_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c] -image_sample_c_d_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D -// GFX11: [0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c] +image_sample_c_d_o_g16 v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +// GFX11: [0x08,0x03,0xf0,0xf0,0xf1,0x05,0x02,0x0c] image_sample_c_d_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0xf1,0xf0,0x01,0x05,0x02,0x0c] @@ -4344,11 +4344,11 @@ image_sample_d v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0x71,0xf0,0xfd,0x05,0x02,0x0c] -image_sample_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c] -image_sample_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D -// GFX11: [0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c] +image_sample_d v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +// GFX11: [0x08,0x03,0x70,0xf0,0xf1,0x05,0x02,0x0c] image_sample_d v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0x71,0xf0,0x01,0x05,0x02,0x0c] @@ -4416,10 +4416,10 @@ image_sample_d_cl v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0x05,0xf1,0xfd,0x05,0x02,0x0c] -image_sample_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_d_cl v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c] -image_sample_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_d_cl v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c] image_sample_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 @@ -4560,17 +4560,17 @@ image_sample_d_cl_o 
v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0x1d,0xf1,0xfc,0x05,0x02,0x0c] -image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_d_cl_o v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c] -image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D -// GFX11: [0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c] +image_sample_d_cl_o v[5:6], v[241:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +// GFX11: [0x08,0x03,0x1c,0xf1,0xf1,0x05,0x02,0x0c] -image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +image_sample_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c] -image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 -// GFX11: [0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c] +image_sample_d_cl_o v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +// GFX11: [0x08,0x03,0x1d,0xf1,0xf1,0x05,0x02,0x0c] image_sample_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D // GFX11: [0x04,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c] @@ -4584,11 +4584,11 @@ image_sample_d_cl_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe // GFX11: [0x04,0x03,0x1d,0xf1,0xf9,0xfd,0x22,0x0c] -image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 +image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 // GFX11: [0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c] -image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 -// GFX11: [0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c] +image_sample_d_cl_o v255, v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 +// GFX11: [0x0c,0x03,0x1e,0xf1,0xf1,0xff,0x02,0x0c] image_sample_d_cl_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 // GFX11: [0x0c,0x03,0x1f,0xf1,0x01,0x05,0x22,0x0c] @@ -4608,11 +4608,11 @@ image_sample_d_cl_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe // GFX11: [0x10,0x04,0x1d,0xf1,0xfb,0xfe,0x22,0x0c] -image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 +image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 // GFX11: [0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c] -image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 -// GFX11: [0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c] +image_sample_d_cl_o v255, v[241:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 +// GFX11: [0x14,0x04,0x1e,0xf1,0xf1,0xff,0x02,0x0c] image_sample_d_cl_o v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 // GFX11: [0x14,0x04,0x1f,0xf1,0x01,0x05,0x38,0x64] @@ -4632,11 +4632,11 @@ image_sample_d_cl_o_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0x55,0xf1,0xfc,0x05,0x02,0x0c] -image_sample_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_d_cl_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c] -image_sample_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D -// GFX11: [0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c] 
+image_sample_d_cl_o_g16 v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +// GFX11: [0x08,0x03,0x54,0xf1,0xf1,0x05,0x02,0x0c] image_sample_d_cl_o_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0x55,0xf1,0x01,0x05,0x02,0x0c] @@ -4776,17 +4776,17 @@ image_sample_d_o v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 // GFX11: [0x00,0x03,0x99,0xf0,0xfc,0x05,0x02,0x0c] -image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c] -image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D +image_sample_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D // GFX11: [0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c] -image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +image_sample_d_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 // GFX11: [0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c] -image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 -// GFX11: [0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c] +image_sample_d_o v[5:6], v[241:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 +// GFX11: [0x08,0x03,0x99,0xf0,0xf1,0x05,0x02,0x0c] image_sample_d_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D // GFX11: [0x04,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c] Index: llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s =================================================================== --- llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s +++ llvm/test/MC/AMDGPU/gfx11_asm_mimg_features.s @@ -163,8 +163,8 @@ image_sample_d v[64:66], v[32:39], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_2D // GFX11: image_sample_d v[64:66], v[32:39], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x07,0x70,0xf0,0x20,0x40,0x01,0x64] -image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D -// GFX11: image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64] +image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D +// GFX11: image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64] image_sample_d v[64:66], [v32, v16, v8, v4], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_1D_ARRAY // GFX11: image_sample_d v[64:66], [v32, v16, v8, v4], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x11,0x07,0x70,0xf0,0x20,0x40,0x01,0x64,0x10,0x08,0x04,0x00] @@ -286,17 +286,17 @@ image_msaa_load v[10:13], [v204, v11, v14, v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY // GFX11: image_msaa_load v[10:13], [v204, v11, v14, v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; encoding: [0x1d,0x01,0x60,0xf0,0xcc,0x0a,0x0a,0x00,0x0b,0x0e,0x13,0x00] -image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] -// GFX11: image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00] +image_bvh_intersect_ray v[4:7], v[9:19], s[4:7] +// GFX11: image_bvh_intersect_ray v[4:7], v[9:19], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00] image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 // GFX11: image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 ; encoding: [0x80,0x8f,0x65,0xf0,0x09,0x04,0x01,0x00] 
-image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] -// GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00] +image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7] +// GFX11: image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00] -image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 -// GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00] +image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16 +// GFX11: image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00] image_bvh_intersect_ray v[39:42], [v50, v46, v[20:22], v[40:42], v[47:49]], s[12:15] // GFX11: image_bvh_intersect_ray v[39:42], [v50, v46, v[20:22], v[40:42], v[47:49]], s[12:15] ; encoding: [0x81,0x8f,0x64,0xf0,0x32,0x27,0x03,0x00,0x2e,0x14,0x28,0x2f] Index: llvm/test/MC/AMDGPU/gfx7_asm_mimg.s =================================================================== --- llvm/test/MC/AMDGPU/gfx7_asm_mimg.s +++ llvm/test/MC/AMDGPU/gfx7_asm_mimg.s @@ -1848,7 +1848,7 @@ image_sample_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00] -image_sample_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d v5, v[1:9], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00] image_sample_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 @@ -1947,7 +1947,7 @@ image_sample_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00] -image_sample_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00] image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -2619,7 +2619,7 @@ image_sample_c_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -2712,7 +2712,7 @@ image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xac,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xac,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -3381,7 +3381,7 @@ image_sample_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00] -image_sample_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00] image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -3474,7 +3474,7 @@ image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xcc,0xf0,0x01,0x05,0x62,0x00] -image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xcc,0xf0,0x01,0x05,0x62,0x00] image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -4137,7 +4137,7 @@ image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xe8,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d_o v5, v[1:10], 
s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xe8,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -4230,7 +4230,7 @@ image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xec,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xec,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -6069,7 +6069,7 @@ image_sample_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd v5, v[1:9], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00] image_sample_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 @@ -6168,7 +6168,7 @@ image_sample_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00] image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -6264,7 +6264,7 @@ image_sample_c_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -6357,7 +6357,7 @@ image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xac,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xac,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -6453,7 +6453,7 @@ image_sample_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00] image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -6546,7 +6546,7 @@ image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xb4,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb4,0xf1,0x01,0x05,0x62,0x00] image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -6639,7 +6639,7 @@ image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xb8,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb8,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -6732,7 +6732,7 @@ image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xbc,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xbc,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm Index: 
llvm/test/MC/AMDGPU/gfx8_asm_mimg.s =================================================================== --- llvm/test/MC/AMDGPU/gfx8_asm_mimg.s +++ llvm/test/MC/AMDGPU/gfx8_asm_mimg.s @@ -1773,7 +1773,7 @@ image_sample_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00] -image_sample_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d v5, v[1:9], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00] image_sample_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 @@ -1875,7 +1875,7 @@ image_sample_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00] -image_sample_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00] image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -2568,7 +2568,7 @@ image_sample_c_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -2664,7 +2664,7 @@ image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xac,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xac,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -3354,7 +3354,7 @@ image_sample_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00] -image_sample_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00] image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -3450,7 +3450,7 @@ image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xcc,0xf0,0x01,0x05,0x62,0x00] -image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xcc,0xf0,0x01,0x05,0x62,0x00] image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -4134,7 +4134,7 @@ image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xe8,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xe8,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -4230,7 +4230,7 @@ image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xec,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xec,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -6156,7 +6156,7 @@ image_sample_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd v5, v[1:9], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00] image_sample_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 @@ -6258,7 +6258,7 @@ image_sample_cd_cl v5, v[1:8], s[8:15], 
s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00] image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -6357,7 +6357,7 @@ image_sample_c_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -6453,7 +6453,7 @@ image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xac,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xac,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -6552,7 +6552,7 @@ image_sample_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00] image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -6648,7 +6648,7 @@ image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xb4,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb4,0xf1,0x01,0x05,0x62,0x00] image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -6744,7 +6744,7 @@ image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xb8,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb8,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -6840,7 +6840,7 @@ image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xbc,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xbc,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm Index: llvm/test/MC/AMDGPU/gfx9_asm_mimg.s =================================================================== --- llvm/test/MC/AMDGPU/gfx9_asm_mimg.s +++ llvm/test/MC/AMDGPU/gfx9_asm_mimg.s @@ -1851,7 +1851,7 @@ image_sample_d v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00] -image_sample_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d v5, v[1:9], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x88,0xf0,0x01,0x05,0x62,0x00] image_sample_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 @@ -1956,7 +1956,7 @@ image_sample_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00] -image_sample_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0x8c,0xf0,0x01,0x05,0x62,0x00] image_sample_d_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -2667,7 +2667,7 @@ image_sample_c_d v5, v[1:8], s[8:15], s[12:15] 
dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -2766,7 +2766,7 @@ image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xac,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xac,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -3477,7 +3477,7 @@ image_sample_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00] -image_sample_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xc8,0xf0,0x01,0x05,0x62,0x00] image_sample_d_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -3576,7 +3576,7 @@ image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xcc,0xf0,0x01,0x05,0x62,0x00] -image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xcc,0xf0,0x01,0x05,0x62,0x00] image_sample_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -4278,7 +4278,7 @@ image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xe8,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xe8,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -4377,7 +4377,7 @@ image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xec,0xf0,0x01,0x05,0x62,0x00] -image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xec,0xf0,0x01,0x05,0x62,0x00] image_sample_c_d_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -6399,7 +6399,7 @@ image_sample_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd v5, v[1:9], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa0,0xf1,0x01,0x05,0x62,0x00] image_sample_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 @@ -6504,7 +6504,7 @@ image_sample_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa4,0xf1,0x01,0x05,0x62,0x00] image_sample_cd_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -6606,7 +6606,7 @@ image_sample_c_cd v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -6705,7 +6705,7 @@ image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xac,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd_cl v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: 
[0x00,0x01,0xac,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd_cl v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -6807,7 +6807,7 @@ image_sample_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb0,0xf1,0x01,0x05,0x62,0x00] image_sample_cd_o v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm @@ -6906,7 +6906,7 @@ image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xb4,0xf1,0x01,0x05,0x62,0x00] -image_sample_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb4,0xf1,0x01,0x05,0x62,0x00] image_sample_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -7005,7 +7005,7 @@ image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xb8,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xb8,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm @@ -7104,7 +7104,7 @@ image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x0 // CHECK: [0x00,0x00,0xbc,0xf1,0x01,0x05,0x62,0x00] -image_sample_c_cd_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x1 +image_sample_c_cd_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x1 // CHECK: [0x00,0x01,0xbc,0xf1,0x01,0x05,0x62,0x00] image_sample_c_cd_cl_o v5, v[1:8], s[8:15], s[12:15] dmask:0x1 unorm Index: llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt =================================================================== --- llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt +++ llvm/test/MC/Disassembler/AMDGPU/gfx1030_new.txt @@ -75,16 +75,16 @@ # GFX10: v_fmac_legacy_f32_e64 v0, s1, 2.0 0x00,0x00,0x06,0xd5,0x01,0xe8,0x01,0x00 -# GFX10: image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] +# GFX10: image_bvh_intersect_ray v[4:7], v[9:19], s[4:7] 0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x00 # GFX10: image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 0x01,0x9f,0x98,0xf1,0x09,0x04,0x01,0x40 -# GFX10: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] +# GFX10: image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7] 0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x00 -# GFX10: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 +# GFX10: image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16 0x01,0x9f,0x9c,0xf1,0x09,0x04,0x01,0x40 # GFX10: image_bvh_intersect_ray v[39:42], [v50, v46, v23, v17, v16, v15, v21, v20, v19, v37, v40], s[12:15] Index: llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt =================================================================== --- llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt +++ llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg.txt @@ -1248,22 +1248,22 @@ # GFX11: image_atomic_xor v[254:255], v[254:255], ttmp[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc slc dlc a16 lwe ; encoding: [0x98,0x73,0x51,0xf0,0xfe,0xfe,0x5d,0x00] 0x98,0x73,0x51,0xf0,0xfe,0xfe,0x5d,0x00 -# GFX11: image_bvh64_intersect_ray v[5:8], v[1:16], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00] +# GFX11: image_bvh64_intersect_ray v[5:8], v[1:12], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00] 0x80,0x8f,0x68,0xf0,0x01,0x05,0x02,0x00 -# GFX11: image_bvh64_intersect_ray v[5:8], v[240:255], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00] +# GFX11: image_bvh64_intersect_ray 
v[5:8], v[240:251], s[8:11] ; encoding: [0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00] 0x80,0x8f,0x68,0xf0,0xf0,0x05,0x02,0x00 -# GFX11: image_bvh64_intersect_ray v[5:8], v[1:16], s[100:103] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00] +# GFX11: image_bvh64_intersect_ray v[5:8], v[1:9], s[100:103] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00] 0x80,0x8f,0x69,0xf0,0x01,0x05,0x19,0x00 -# GFX11: image_bvh64_intersect_ray v[252:255], v[240:255], ttmp[12:15] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00] +# GFX11: image_bvh64_intersect_ray v[252:255], v[240:248], ttmp[12:15] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00] 0x80,0x8f,0x69,0xf0,0xf0,0xfc,0x1e,0x00 -# GFX11: image_bvh_intersect_ray v[5:8], v[1:16], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00] +# GFX11: image_bvh_intersect_ray v[5:8], v[1:11], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00] 0x80,0x8f,0x64,0xf0,0x01,0x05,0x02,0x00 -# GFX11: image_bvh_intersect_ray v[5:8], v[240:255], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00] +# GFX11: image_bvh_intersect_ray v[5:8], v[240:250], s[8:11] ; encoding: [0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00] 0x80,0x8f,0x64,0xf0,0xf0,0x05,0x02,0x00 # GFX11: image_bvh_intersect_ray v[5:8], v[1:8], s[100:103] a16 ; encoding: [0x80,0x8f,0x65,0xf0,0x01,0x05,0x19,0x00] @@ -3264,16 +3264,16 @@ # GFX11: image_sample_c_d v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x85,0xf0,0xfc,0x05,0x02,0x0c] 0x00,0x03,0x85,0xf0,0xfc,0x05,0x02,0x0c -# GFX11: image_sample_c_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c] 0x08,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_c_d v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x84,0xf0,0xf0,0x05,0x02,0x0c -# GFX11: image_sample_c_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c] 0x08,0x03,0x85,0xf0,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_c_d v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x85,0xf0,0xf0,0x05,0x02,0x0c # GFX11: image_sample_c_d v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x84,0xf0,0x01,0x05,0x02,0x0c] @@ -3336,16 +3336,16 @@ # GFX11: image_sample_c_d_cl v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x11,0xf1,0xfc,0x05,0x02,0x0c] 0x00,0x03,0x11,0xf1,0xfc,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: 
[0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c] 0x08,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl v[5:6], v[240:250], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x10,0xf1,0xf0,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c] 0x08,0x03,0x11,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x11,0xf1,0xf0,0x05,0x02,0x0c # GFX11: image_sample_c_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x10,0xf1,0x01,0x05,0x02,0x0c] @@ -3360,10 +3360,10 @@ # GFX11: image_sample_c_d_cl v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0x11,0xf1,0xf9,0xfd,0x22,0x0c] 0x04,0x03,0x11,0xf1,0xf9,0xfd,0x22,0x0c -# GFX11: image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c] 0x0c,0x03,0x12,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c] +# GFX11: image_sample_c_d_cl v255, v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c] 0x0c,0x03,0x12,0xf1,0xf0,0xff,0x02,0x0c # GFX11: image_sample_c_d_cl v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0x13,0xf1,0x01,0x05,0x22,0x0c] @@ -3384,10 +3384,10 @@ # GFX11: image_sample_c_d_cl v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0x11,0xf1,0xfb,0xfe,0x22,0x0c] 0x10,0x04,0x11,0xf1,0xfb,0xfe,0x22,0x0c -# GFX11: image_sample_c_d_cl v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c] 0x14,0x04,0x12,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c] +# GFX11: image_sample_c_d_cl v255, v[240:248], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c] 0x14,0x04,0x12,0xf1,0xf0,0xff,0x02,0x0c # GFX11: image_sample_c_d_cl v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0x13,0xf1,0x01,0x05,0x38,0x64] @@ -3408,10 +3408,10 @@ # GFX11: image_sample_c_d_cl_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 
dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x51,0xf1,0xfc,0x05,0x02,0x0c] 0x00,0x03,0x51,0xf1,0xfc,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c] 0x08,0x03,0x50,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl_g16 v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x50,0xf1,0xf0,0x05,0x02,0x0c # GFX11: image_sample_c_d_cl_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x51,0xf1,0x01,0x05,0x02,0x0c] @@ -3480,22 +3480,22 @@ # GFX11: image_sample_c_d_cl_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x29,0xf1,0xfb,0x05,0x02,0x0c] 0x00,0x03,0x29,0xf1,0xfb,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o v[5:6], v[1:12], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c] 0x08,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o v[5:6], v[240:251], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x28,0xf1,0xf0,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c] 0x08,0x03,0x29,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x29,0xf1,0xf0,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c] 0x04,0x03,0x28,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_o v[254:255], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o v[254:255], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c] 0x04,0x03,0x28,0xf1,0xf0,0xfe,0x02,0x0c # GFX11: image_sample_c_d_cl_o v[5:7], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0x29,0xf1,0x01,0x05,0x22,0x0c] @@ -3504,10 +3504,10 @@ # GFX11: image_sample_c_d_cl_o v[253:255], v[248:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe 
; encoding: [0x04,0x03,0x29,0xf1,0xf8,0xfd,0x22,0x0c] 0x04,0x03,0x29,0xf1,0xf8,0xfd,0x22,0x0c -# GFX11: image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c] 0x0c,0x03,0x2a,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c] 0x0c,0x03,0x2a,0xf1,0xf0,0xff,0x02,0x0c # GFX11: image_sample_c_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0x2b,0xf1,0x01,0x05,0x22,0x0c] @@ -3528,10 +3528,10 @@ # GFX11: image_sample_c_d_cl_o v[254:255], v[250:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0x29,0xf1,0xfa,0xfe,0x22,0x0c] 0x10,0x04,0x29,0xf1,0xfa,0xfe,0x22,0x0c -# GFX11: image_sample_c_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o v5, v[1:10], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c] 0x14,0x04,0x2a,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o v255, v[240:249], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c] 0x14,0x04,0x2a,0xf1,0xf0,0xff,0x02,0x0c # GFX11: image_sample_c_d_cl_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0x2b,0xf1,0x01,0x05,0x38,0x64] @@ -3552,10 +3552,10 @@ # GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x59,0xf1,0xfb,0x05,0x02,0x0c] 0x00,0x03,0x59,0xf1,0xfb,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c] 0x08,0x03,0x58,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x58,0xf1,0xf0,0x05,0x02,0x0c # GFX11: image_sample_c_d_cl_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x59,0xf1,0x01,0x05,0x02,0x0c] @@ -3696,16 +3696,16 @@ # GFX11: image_sample_c_d_o v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0xad,0xf0,0xfb,0x05,0x02,0x0c] 0x00,0x03,0xad,0xf0,0xfb,0x05,0x02,0x0c -# GFX11: image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_o v[5:6], 
v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c] 0x08,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_o v[5:6], v[240:250], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c] 0x08,0x03,0xac,0xf0,0xf0,0x05,0x02,0x0c -# GFX11: image_sample_c_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c] 0x08,0x03,0xad,0xf0,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c] 0x08,0x03,0xad,0xf0,0xf0,0x05,0x02,0x0c # GFX11: image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0xac,0xf0,0x01,0x05,0x02,0x0c] @@ -3720,10 +3720,10 @@ # GFX11: image_sample_c_d_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0xad,0xf0,0xf9,0xfd,0x22,0x0c] 0x04,0x03,0xad,0xf0,0xf9,0xfd,0x22,0x0c -# GFX11: image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c] 0x0c,0x03,0xae,0xf0,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c] +# GFX11: image_sample_c_d_o v255, v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c] 0x0c,0x03,0xae,0xf0,0xf0,0xff,0x02,0x0c # GFX11: image_sample_c_d_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0xaf,0xf0,0x01,0x05,0x22,0x0c] @@ -3744,10 +3744,10 @@ # GFX11: image_sample_c_d_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0xad,0xf0,0xfb,0xfe,0x22,0x0c] 0x10,0x04,0xad,0xf0,0xfb,0xfe,0x22,0x0c -# GFX11: image_sample_c_d_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c] 0x14,0x04,0xae,0xf0,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c] +# GFX11: image_sample_c_d_o v255, v[240:248], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c] 0x14,0x04,0xae,0xf0,0xf0,0xff,0x02,0x0c # GFX11: image_sample_c_d_o v[5:6], v[1:8], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0xaf,0xf0,0x01,0x05,0x38,0x64] @@ -3768,10 +3768,10 @@ # GFX11: image_sample_c_d_o_g16 
v[5:6], v[251:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0xf1,0xf0,0xfb,0x05,0x02,0x0c] 0x00,0x03,0xf1,0xf0,0xfb,0x05,0x02,0x0c -# GFX11: image_sample_c_d_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c] 0x08,0x03,0xf0,0xf0,0x01,0x05,0x02,0x0c -# GFX11: image_sample_c_d_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_c_d_o_g16 v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c] 0x08,0x03,0xf0,0xf0,0xf0,0x05,0x02,0x0c # GFX11: image_sample_c_d_o_g16 v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0xf1,0xf0,0x01,0x05,0x02,0x0c] @@ -4344,10 +4344,10 @@ # GFX11: image_sample_d v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x71,0xf0,0xfd,0x05,0x02,0x0c] 0x00,0x03,0x71,0xf0,0xfd,0x05,0x02,0x0c -# GFX11: image_sample_d v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_d v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c] 0x08,0x03,0x70,0xf0,0x01,0x05,0x02,0x0c -# GFX11: image_sample_d v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_d v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x70,0xf0,0xf0,0x05,0x02,0x0c # GFX11: image_sample_d v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x71,0xf0,0x01,0x05,0x02,0x0c] @@ -4416,10 +4416,10 @@ # GFX11: image_sample_d_cl v[5:6], v[253:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x05,0xf1,0xfd,0x05,0x02,0x0c] 0x00,0x03,0x05,0xf1,0xfd,0x05,0x02,0x0c -# GFX11: image_sample_d_cl v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_d_cl v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c] 0x08,0x03,0x04,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_d_cl v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_d_cl v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x04,0xf1,0xf0,0x05,0x02,0x0c # GFX11: image_sample_d_cl v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x05,0xf1,0x01,0x05,0x02,0x0c] @@ -4560,16 +4560,16 @@ # GFX11: image_sample_d_cl_o v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x1d,0xf1,0xfc,0x05,0x02,0x0c] 0x00,0x03,0x1d,0xf1,0xfc,0x05,0x02,0x0c -# GFX11: image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_d_cl_o v[5:6], v[1:11], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: 
[0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c] 0x08,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_d_cl_o v[5:6], v[240:250], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x1c,0xf1,0xf0,0x05,0x02,0x0c -# GFX11: image_sample_d_cl_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_d_cl_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c] 0x08,0x03,0x1d,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_d_cl_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_d_cl_o v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x1d,0xf1,0xf0,0x05,0x02,0x0c # GFX11: image_sample_d_cl_o v[5:6], v[1:8], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x1c,0xf1,0x01,0x05,0x02,0x0c] @@ -4584,10 +4584,10 @@ # GFX11: image_sample_d_cl_o v[253:255], v[249:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D a16 tfe ; encoding: [0x04,0x03,0x1d,0xf1,0xf9,0xfd,0x22,0x0c] 0x04,0x03,0x1d,0xf1,0xf9,0xfd,0x22,0x0c -# GFX11: image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c] 0x0c,0x03,0x1e,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c] +# GFX11: image_sample_d_cl_o v255, v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE d16 ; encoding: [0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c] 0x0c,0x03,0x1e,0xf1,0xf0,0xff,0x02,0x0c # GFX11: image_sample_d_cl_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_CUBE a16 tfe d16 ; encoding: [0x0c,0x03,0x1f,0xf1,0x01,0x05,0x22,0x0c] @@ -4608,10 +4608,10 @@ # GFX11: image_sample_d_cl_o v[254:255], v[251:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_1D_ARRAY a16 tfe ; encoding: [0x10,0x04,0x1d,0xf1,0xfb,0xfe,0x22,0x0c] 0x10,0x04,0x1d,0xf1,0xfb,0xfe,0x22,0x0c -# GFX11: image_sample_d_cl_o v5, v[1:16], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_d_cl_o v5, v[1:9], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c] 0x14,0x04,0x1e,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_d_cl_o v255, v[240:255], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c] +# GFX11: image_sample_d_cl_o v255, v[240:248], s[8:15], s[12:15] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY d16 ; encoding: [0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c] 0x14,0x04,0x1e,0xf1,0xf0,0xff,0x02,0x0c # GFX11: image_sample_d_cl_o v[5:6], v[1:7], s[96:103], s[100:103] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 tfe d16 ; encoding: [0x14,0x04,0x1f,0xf1,0x01,0x05,0x38,0x64] @@ -4632,10 +4632,10 @@ # GFX11: image_sample_d_cl_o_g16 v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 
dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x55,0xf1,0xfc,0x05,0x02,0x0c] 0x00,0x03,0x55,0xf1,0xfc,0x05,0x02,0x0c -# GFX11: image_sample_d_cl_o_g16 v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_d_cl_o_g16 v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c] 0x08,0x03,0x54,0xf1,0x01,0x05,0x02,0x0c -# GFX11: image_sample_d_cl_o_g16 v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_d_cl_o_g16 v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x54,0xf1,0xf0,0x05,0x02,0x0c # GFX11: image_sample_d_cl_o_g16 v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x55,0xf1,0x01,0x05,0x02,0x0c] @@ -4776,16 +4776,16 @@ # GFX11: image_sample_d_o v[5:6], v[252:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_1D a16 ; encoding: [0x00,0x03,0x99,0xf0,0xfc,0x05,0x02,0x0c] 0x00,0x03,0x99,0xf0,0xfc,0x05,0x02,0x0c -# GFX11: image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_d_o v[5:6], v[1:10], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c] 0x08,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c -# GFX11: image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_d_o v[5:6], v[240:249], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x98,0xf0,0xf0,0x05,0x02,0x0c -# GFX11: image_sample_d_o v[5:6], v[1:16], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c] +# GFX11: image_sample_d_o v[5:6], v[1:9], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c] 0x08,0x03,0x99,0xf0,0x01,0x05,0x02,0x0c -# GFX11: image_sample_d_o v[5:6], v[240:255], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c] +# GFX11: image_sample_d_o v[5:6], v[240:248], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_3D a16 ; encoding: [0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c] 0x08,0x03,0x99,0xf0,0xf0,0x05,0x02,0x0c # GFX11: image_sample_d_o v[5:6], v[1:7], s[8:15], s[12:15] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x03,0x98,0xf0,0x01,0x05,0x02,0x0c] Index: llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt =================================================================== --- llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt +++ llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt @@ -162,7 +162,7 @@ # GFX11: image_sample_d v[64:66], v[32:37], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x07,0x70,0xf0,0x20,0x40,0x01,0x64] 0x04,0x07,0x70,0xf0,0x20,0x40,0x01,0x64 -# GFX11: image_sample_d v[64:66], v[32:47], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64] +# GFX11: image_sample_d v[64:66], v[32:40], s[4:11], s[100:103] dmask:0x7 dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64] 0x08,0x07,0x70,0xf0,0x20,0x40,0x01,0x64 # GFX11: image_sample_d v[64:66], [v32, v16, v8, v4], s[4:11], s[100:103] 
dmask:0x7 dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x11,0x07,0x70,0xf0,0x20,0x40,0x01,0x64,0x10,0x08,0x04,0x00] @@ -282,16 +282,16 @@ # GFX11: image_msaa_load v[10:13], [v204, v11, v14, v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; encoding: [0x1d,0x01,0x60,0xf0,0xcc,0x0a,0x0a,0x00,0x0b,0x0e,0x13,0x00] 0x1d,0x01,0x60,0xf0,0xcc,0x0a,0x0a,0x00,0x0b,0x0e,0x13,0x00 -# GFX11: image_bvh_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00] +# GFX11: image_bvh_intersect_ray v[4:7], v[9:19], s[4:7] ; encoding: [0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00] 0x80,0x8f,0x64,0xf0,0x09,0x04,0x01,0x00 # GFX11: image_bvh_intersect_ray v[4:7], v[9:16], s[4:7] a16 ; encoding: [0x80,0x8f,0x65,0xf0,0x09,0x04,0x01,0x00] 0x80,0x8f,0x65,0xf0,0x09,0x04,0x01,0x00 -# GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00] +# GFX11: image_bvh64_intersect_ray v[4:7], v[9:20], s[4:7] ; encoding: [0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00] 0x80,0x8f,0x68,0xf0,0x09,0x04,0x01,0x00 -# GFX11: image_bvh64_intersect_ray v[4:7], v[9:24], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00] +# GFX11: image_bvh64_intersect_ray v[4:7], v[9:17], s[4:7] a16 ; encoding: [0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00] 0x80,0x8f,0x69,0xf0,0x09,0x04,0x01,0x00 # GFX11: image_bvh_intersect_ray v[39:42], [v50, v46, v[20:22], v[40:42], v[47:49]], s[12:15] ; encoding: [0x81,0x8f,0x64,0xf0,0x32,0x27,0x03,0x00,0x2e,0x14,0x28,0x2f]