diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -965,6 +965,16 @@ IsSGPR = false; IsAGPR = true; Width = 6; + } else if (AMDGPU::VReg_224RegClass.contains(Reg)) { + IsSGPR = false; + Width = 7; + } else if (AMDGPU::SReg_224RegClass.contains(Reg)) { + IsSGPR = true; + Width = 7; + } else if (AMDGPU::AReg_224RegClass.contains(Reg)) { + IsSGPR = false; + IsAGPR = true; + Width = 7; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && "trap handler registers should not be used"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -78,6 +78,12 @@ setOperationAction(ISD::LOAD, MVT::v5f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32); + setOperationAction(ISD::LOAD, MVT::v6f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32); + + setOperationAction(ISD::LOAD, MVT::v7f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32); + setOperationAction(ISD::LOAD, MVT::v8f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); @@ -99,9 +105,15 @@ setOperationAction(ISD::LOAD, MVT::v2f64, Promote); AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::v3i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32); + setOperationAction(ISD::LOAD, MVT::v4i64, Promote); AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32); + setOperationAction(ISD::LOAD, MVT::v3f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32); + setOperationAction(ISD::LOAD, MVT::v4f64, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32); @@ -173,12 +185,14 @@ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand); @@ -198,6 +212,12 @@ setOperationAction(ISD::STORE, MVT::v5f32, Promote); AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32); + setOperationAction(ISD::STORE, MVT::v6f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32); + + setOperationAction(ISD::STORE, MVT::v7f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32); + setOperationAction(ISD::STORE, MVT::v8f32, Promote); AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); @@ -219,6 +239,12 @@ setOperationAction(ISD::STORE, MVT::v2f64, Promote); AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::v3i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32); + + setOperationAction(ISD::STORE, MVT::v3f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32); + 
setOperationAction(ISD::STORE, MVT::v4i64, Promote); AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32); @@ -261,6 +287,11 @@ setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); + setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand); + setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand); setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand); @@ -325,6 +356,10 @@ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom); @@ -337,6 +372,10 @@ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom); @@ -345,6 +384,8 @@ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom); @@ -414,8 +455,7 @@ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); static const MVT::SimpleValueType VectorIntTypes[] = { - MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32 - }; + MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32}; for (MVT VT : VectorIntTypes) { // Expand the following operations for the current type by default. @@ -456,8 +496,7 @@ } static const MVT::SimpleValueType FloatVectorTypes[] = { - MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32 - }; + MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32}; for (MVT VT : FloatVectorTypes) { setOperationAction(ISD::FABS, VT, Expand); @@ -507,6 +546,12 @@ setOperationAction(ISD::SELECT, MVT::v5f32, Promote); AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32); + setOperationAction(ISD::SELECT, MVT::v6f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32); + + setOperationAction(ISD::SELECT, MVT::v7f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32); + // There are no libcalls of any kind. 
for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -7,16 +7,16 @@ //===----------------------------------------------------------------------===// def SGPRRegBank : RegisterBank<"SGPR", - [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024] + [SReg_LO16, SReg_32, SReg_64, SReg_96, SReg_128, SReg_160, SReg_192, SReg_224, SReg_256, SReg_512, SReg_1024] >; def VGPRRegBank : RegisterBank<"VGPR", - [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_256, VReg_512, VReg_1024] + [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_224, VReg_256, VReg_512, VReg_1024] >; // It is helpful to distinguish conditions from ordinary SGPRs. def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; def AGPRRegBank : RegisterBank <"AGPR", - [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_256, AReg_512, AReg_1024] + [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_224, AReg_256, AReg_512, AReg_1024] >; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -2192,6 +2192,7 @@ case 4: return AMDGPU::VReg_128RegClassID; case 5: return AMDGPU::VReg_160RegClassID; case 6: return AMDGPU::VReg_192RegClassID; + case 7: return AMDGPU::VReg_224RegClassID; case 8: return AMDGPU::VReg_256RegClassID; case 16: return AMDGPU::VReg_512RegClassID; case 32: return AMDGPU::VReg_1024RegClassID; @@ -2214,6 +2215,7 @@ case 4: return AMDGPU::SGPR_128RegClassID; case 5: return AMDGPU::SGPR_160RegClassID; case 6: return AMDGPU::SGPR_192RegClassID; + case 7: return AMDGPU::SGPR_224RegClassID; case 8: return AMDGPU::SGPR_256RegClassID; case 16: return AMDGPU::SGPR_512RegClassID; } @@ -2226,6 +2228,7 @@ case 4: return AMDGPU::AReg_128RegClassID; case 5: return AMDGPU::AReg_160RegClassID; case 6: return AMDGPU::AReg_192RegClassID; + case 7: return AMDGPU::AReg_224RegClassID; case 8: return AMDGPU::AReg_256RegClassID; case 16: return AMDGPU::AReg_512RegClassID; case 32: return AMDGPU::AReg_1024RegClassID; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -463,6 +463,7 @@ MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) || MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg)) Enc |= 512; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -102,6 +102,15 @@ addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160)); + addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); +
addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192)); + + addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); + addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192)); + + addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); + addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224)); + addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256)); @@ -145,6 +154,8 @@ setOperationAction(ISD::LOAD, MVT::v3i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v5i32, Custom); + setOperationAction(ISD::LOAD, MVT::v6i32, Custom); + setOperationAction(ISD::LOAD, MVT::v7i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::LOAD, MVT::v16i32, Custom); setOperationAction(ISD::LOAD, MVT::i1, Custom); @@ -154,6 +165,8 @@ setOperationAction(ISD::STORE, MVT::v3i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); setOperationAction(ISD::STORE, MVT::v5i32, Custom); + setOperationAction(ISD::STORE, MVT::v6i32, Custom); + setOperationAction(ISD::STORE, MVT::v7i32, Custom); setOperationAction(ISD::STORE, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::v16i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); @@ -176,6 +189,8 @@ setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand); @@ -203,8 +218,16 @@ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand); @@ -245,6 +268,7 @@ // with > 4 elements. 
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, + MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { @@ -290,6 +314,20 @@ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); } + for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32); + } + for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32); @@ -365,9 +403,13 @@ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom); - // Deal with vec5 vector operations when widened to vec8. + // Deal with vec5/6/7 vector operations when widened to vec8. setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom); @@ -11697,6 +11739,8 @@ return AMDGPU::VReg_160_Align2RegClassID; case AMDGPU::VReg_192RegClassID: return AMDGPU::VReg_192_Align2RegClassID; + case AMDGPU::VReg_224RegClassID: + return AMDGPU::VReg_224_Align2RegClassID; case AMDGPU::VReg_256RegClassID: return AMDGPU::VReg_256_Align2RegClassID; case AMDGPU::VReg_512RegClassID: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1340,6 +1340,8 @@ return AMDGPU::SI_SPILL_S160_SAVE; case 24: return AMDGPU::SI_SPILL_S192_SAVE; + case 28: + return AMDGPU::SI_SPILL_S224_SAVE; case 32: return AMDGPU::SI_SPILL_S256_SAVE; case 64: @@ -1365,6 +1367,8 @@ return AMDGPU::SI_SPILL_V160_SAVE; case 24: return AMDGPU::SI_SPILL_V192_SAVE; + case 28: + return AMDGPU::SI_SPILL_V224_SAVE; case 32: return AMDGPU::SI_SPILL_V256_SAVE; case 64: @@ -1473,6 +1477,8 @@ return AMDGPU::SI_SPILL_S160_RESTORE; case 24: return AMDGPU::SI_SPILL_S192_RESTORE; + case 28: + return AMDGPU::SI_SPILL_S224_RESTORE; case 32: return AMDGPU::SI_SPILL_S256_RESTORE; case 64: @@ -1498,6 +1504,8 @@ return AMDGPU::SI_SPILL_V160_RESTORE; case 24: return AMDGPU::SI_SPILL_V192_RESTORE; + case 28: + return AMDGPU::SI_SPILL_V224_RESTORE; case 32: return AMDGPU::SI_SPILL_V256_RESTORE; case 64: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -675,6 +675,7 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR ; defm SI_SPILL_S160 : SI_SPILL_SGPR ; defm SI_SPILL_S192 : 
SI_SPILL_SGPR <SReg_192>; +defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>; @@ -718,6 +719,7 @@ defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>; defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>; +defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>; defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>; @@ -728,6 +730,7 @@ defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>; defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>; defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>; +defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>; defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>; defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>; defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>; @@ -1052,6 +1055,38 @@ >; } +foreach Index = 0-5 in { + def Extract_Element_v6i32_#Index : Extract_Element < + i32, v6i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v6i32_#Index : Insert_Element < + i32, v6i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v6f32_#Index : Extract_Element < + f32, v6f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v6f32_#Index : Insert_Element < + f32, v6f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + +foreach Index = 0-6 in { + def Extract_Element_v7i32_#Index : Extract_Element < + i32, v7i32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v7i32_#Index : Insert_Element < + i32, v7i32, Index, !cast<SubRegIndex>(sub#Index) + >; + + def Extract_Element_v7f32_#Index : Extract_Element < + f32, v7f32, Index, !cast<SubRegIndex>(sub#Index) + >; + def Insert_Element_v7f32_#Index : Insert_Element < + f32, v7f32, Index, !cast<SubRegIndex>(sub#Index) + >; +} + foreach Index = 0-7 in { def Extract_Element_v8i32_#Index : Extract_Element < i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) @@ -1202,8 +1237,32 @@ def : BitConvert ; // 160-bit bitcast -def : BitConvert ; -def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +// 192-bit bitcast +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +// 224-bit bitcast +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; // 256-bit bitcast def : BitConvert ; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1874,6 +1874,8 @@ return &AMDGPU::VReg_160RegClass; if (BitWidth <= 192) return &AMDGPU::VReg_192RegClass; + if (BitWidth <= 224) + return &AMDGPU::VReg_224RegClass; if (BitWidth <= 256) return &AMDGPU::VReg_256RegClass; if (BitWidth <= 512) @@ -1896,6 +1898,8 @@ return &AMDGPU::VReg_160_Align2RegClass; if (BitWidth <= 192) return &AMDGPU::VReg_192_Align2RegClass; + if (BitWidth <= 224) + return &AMDGPU::VReg_224_Align2RegClass; if (BitWidth <= 256) return &AMDGPU::VReg_256_Align2RegClass; if (BitWidth <= 512) @@ -2036,6 +2040,11 @@ &AMDGPU::SReg_192RegClass, &AMDGPU::AReg_192_Align2RegClass, &AMDGPU::AReg_192RegClass, + &AMDGPU::VReg_224_Align2RegClass, + &AMDGPU::VReg_224RegClass, + &AMDGPU::SReg_224RegClass, + &AMDGPU::AReg_224_Align2RegClass, + &AMDGPU::AReg_224RegClass, &AMDGPU::VReg_256_Align2RegClass, &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -58,6 +58,7 @@ list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3]; list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4]; list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5]; + list<SubRegIndex> ret7 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6]; list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7]; list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, @@ -77,9 +78,10 @@ !if(!eq(size, 4), ret4, !if(!eq(size, 5), ret5, !if(!eq(size, 6), ret6, - !if(!eq(size, 8), ret8, - !if(!eq(size, 16), ret16, - ret32))))))); + !if(!eq(size, 7), ret7, + !if(!eq(size, 8), ret8, + !if(!eq(size, 16), ret16, + ret32)))))))); } // Generates list of sequential register tuple names. @@ -350,9 +352,12 @@ // SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs. def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">; -// SGPR 192-bit registers +// SGPR 192-bit registers. No operations use these, but for symmetry with 192-bit VGPRs. def SGPR_192Regs : SIRegisterTuples<getSubRegs<6>.ret, SGPR_32, 105, 4, 6, "s">; +// SGPR 224-bit registers. No operations use these, but for symmetry with 224-bit VGPRs. +def SGPR_224Regs : SIRegisterTuples<getSubRegs<7>.ret, SGPR_32, 105, 4, 7, "s">; + // SGPR 256-bit registers def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">; @@ -508,6 +513,9 @@ // VGPR 192-bit registers def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">; +// VGPR 224-bit registers +def VGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, VGPR_32, 255, 1, 7, "v">; + // VGPR 256-bit registers def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">; @@ -547,6 +555,9 @@ // AGPR 192-bit registers def AGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, AGPR_32, 255, 1, 6, "a">; +// AGPR 224-bit registers +def AGPR_224 : SIRegisterTuples<getSubRegs<7>.ret, AGPR_32, 255, 1, 7, "a">; + // AGPR 256-bit registers def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">; @@ -725,20 +736,41 @@ (add SGPR_160)> { // FIXME: Should be isAllocatable = 0, but that causes all TableGen-generated // subclasses of SGPR_160 to be marked unallocatable too. + // This occurs because SGPR_160 and SReg_160 classes are equivalent in size + // meaning their enumeration order is dependent on alphanumeric ordering of + // their names. The superclass for inheritance is the last one in topological + // order (i.e. enumeration order), hence SReg_160 is selected. + // Potential workarounds involve renaming SGPR_160, adding another class + // which is ordered last and hence used for inheritance, or adding more + // registers to SReg_160 to cause it to be moved earlier in the superclass + // list. + let CopyCost = 3; +} + +// There are no 6-component scalar instructions, but this is needed +// for symmetry with VGPRs. +def SGPR_192 : RegisterClass<"AMDGPU", [v6i32, v6f32, v3i64, v3f64], 32, (add SGPR_192Regs)> { + let AllocationPriority = 17; } -def SGPR_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192Regs)> { - let Size = 192; - let AllocationPriority = 17; +def SReg_192 : RegisterClass<"AMDGPU", [v6i32, v6f32, v3i64, v3f64], 32, (add SGPR_192)> { + let isAllocatable = 0; + let CopyCost = 3; } -def SReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192)> { - let Size = 192; +// There are no 7-component scalar instructions, but this is needed +// for symmetry with VGPRs.
+def SGPR_224 : RegisterClass<"AMDGPU", [v7i32, v7f32], 32, (add SGPR_224Regs)> { + let AllocationPriority = 18; +} + +def SReg_224 : RegisterClass<"AMDGPU", [v7i32, v7f32], 32, (add SGPR_224)> { let isAllocatable = 0; + let CopyCost = 4; } def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add SGPR_256Regs)> { - let AllocationPriority = 18; + let AllocationPriority = 19; } def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add TTMP_256Regs)> { @@ -754,7 +786,7 @@ def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32, (add SGPR_512Regs)> { - let AllocationPriority = 19; + let AllocationPriority = 20; } def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32, @@ -776,7 +808,7 @@ def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32, (add SGPR_1024Regs)> { - let AllocationPriority = 20; + let AllocationPriority = 21; } def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32, @@ -812,7 +844,8 @@ defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>; defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>; -defm VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>; +defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>; +defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>; defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>; defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; @@ -832,7 +865,8 @@ defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>; defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>; defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>; -defm AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>; +defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>; +defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>; defm AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>; defm AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>; defm AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1650,6 +1650,13 @@ case AMDGPU::VReg_192_Align2RegClassID: case AMDGPU::AReg_192_Align2RegClassID: return 192; + case AMDGPU::SGPR_224RegClassID: + case AMDGPU::SReg_224RegClassID: + case AMDGPU::VReg_224RegClassID: + case AMDGPU::AReg_224RegClassID: + case AMDGPU::VReg_224_Align2RegClassID: + case AMDGPU::AReg_224_Align2RegClassID: + return 224; case AMDGPU::SGPR_256RegClassID: case AMDGPU::SReg_256RegClassID: case AMDGPU::VReg_256RegClassID: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -3501,13 +3501,13 @@ ; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_s: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s7 +; GPRIDX-NEXT: 
s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc @@ -3526,7 +3526,6 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v12, s5 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 5 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v12, v0, vcc -; GPRIDX-NEXT: v_mov_b32_e32 v13, s6 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v13, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v0, v7 @@ -3535,13 +3534,13 @@ ; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_s: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s1, s3 ; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s3, s5 ; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: v_mov_b32_e32 v14, s7 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: v_mov_b32_e32 v13, s6 ; MOVREL-NEXT: v_mov_b32_e32 v7, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 0 ; MOVREL-NEXT: v_mov_b32_e32 v8, s1 @@ -3551,7 +3550,6 @@ ; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 1 ; MOVREL-NEXT: v_mov_b32_e32 v12, s5 -; MOVREL-NEXT: v_mov_b32_e32 v13, s6 ; MOVREL-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 2 ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc_lo @@ -3574,13 +3572,13 @@ ; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_v: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s7 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc @@ -3600,7 +3598,6 @@ ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc ; GPRIDX-NEXT: v_mov_b32_e32 v0, v8 ; GPRIDX-NEXT: v_mov_b32_e32 v1, v7 @@ -3609,13 +3606,13 @@ ; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_v: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s1, s3 ; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s3, s5 ; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: v_mov_b32_e32 v15, s7 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: v_mov_b32_e32 v14, s6 ; MOVREL-NEXT: v_mov_b32_e32 v8, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; MOVREL-NEXT: v_mov_b32_e32 v9, s1 @@ -3625,7 +3622,6 @@ ; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; MOVREL-NEXT: v_mov_b32_e32 v13, s5 -; MOVREL-NEXT: v_mov_b32_e32 v14, s6 ; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll --- 
a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -15,7 +15,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel @@ -33,7 +33,7 @@ ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3 -; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8 +; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1081,32 +1081,31 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v12, v[4:5] -; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v10, vcc, 1, v0 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v10, v[4:5] +; VI-NEXT: flat_load_ubyte v11, v[6:7] ; VI-NEXT: flat_load_ubyte v8, v[8:9] -; VI-NEXT: flat_load_ubyte v9, v[10:11] +; VI-NEXT: v_add_u32_e32 v4, vcc, 6, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: flat_load_ubyte v7, v[4:5] +; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v7 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v11 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 @@ -1127,25 +1126,23 @@ ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:2 ; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 ; GFX10-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v6, v0, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 ; GFX10-NEXT: global_load_ubyte v7, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(4) ; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, 
v1 -; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v6 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v4 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 -; GFX10-NEXT: global_store_dword v8, v6, s[0:1] offset:24 -; GFX10-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -287,7 +287,7 @@ ; GCN-LABEL: {{^}}v3i64_func_void: ; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx2 v[4:5], off ; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <3 x i64> @v3i64_func_void() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -15,7 +15,7 @@ ; CHECK: .max_flat_workgroup_size: 1024 ; CHECK: .name: test ; CHECK: .private_segment_fixed_size: 0 -; CHECK: .sgpr_count: 8 +; CHECK: .sgpr_count: 6 ; CHECK: .symbol: test.kd ; CHECK: .vgpr_count: {{3|6}} ; WAVE64: .wavefront_size: 64 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -16,7 +16,7 @@ ; CHECK: PrivateSegmentFixedSize: 0 ; CHECK: KernargSegmentAlign: 8 ; CHECK: WavefrontSize: 64 -; CHECK: NumSGPRs: 8 +; CHECK: NumSGPRs: 6 ; CHECK: NumVGPRs: {{3|6}} ; CHECK: MaxFlatWorkGroupSize: 1024 define amdgpu_kernel void @test( @@ -39,7 +39,7 @@ ; CHECK: PrivateSegmentFixedSize: 0 ; CHECK: KernargSegmentAlign: 8 ; CHECK: WavefrontSize: 64 -; CHECK: NumSGPRs: 8 +; CHECK: NumSGPRs: 6 ; CHECK: NumVGPRs: {{3|6}} ; CHECK: MaxFlatWorkGroupSize: 256 define amdgpu_kernel void @test_max_flat_workgroup_size( diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1506,26 +1506,27 @@ ; SI-LABEL: dynamic_insertelement_v3i64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; SI-NEXT: s_load_dword s6, s[4:5], 0x10 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xc +; SI-NEXT: s_load_dword s12, s[4:5], 0x10 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2 -; SI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] -; 
SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1 ; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s10 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] +; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2 +; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5] +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5] ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -1533,26 +1534,27 @@ ; VI-LABEL: dynamic_insertelement_v3i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 -; VI-NEXT: s_load_dword s6, s[4:5], 0x40 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 +; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x30 +; VI-NEXT: s_load_dword s12, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2 -; VI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2 +; VI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5] ; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1039,10 +1039,10 @@ ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s2, s4, 4 @@ -1057,7 +1057,7 @@ ; VI-LABEL: s_insertelement_v2i16_dynamic: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -1076,7 +1076,7 @@ ; CI-LABEL: s_insertelement_v2i16_dynamic: ; CI: ; %bb.0: 
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -1169,10 +1169,10 @@ ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1187,7 +1187,7 @@ ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 @@ -1214,7 +1214,7 @@ ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll --- a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll @@ -1,19 +1,19 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s ; Make sure the expected regmask is generated for sub/superregisters. 
-; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}} +; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}} define void @csr() #0 { call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0 ret void } -; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}} +; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 
$vgpr1_vgpr2_vgpr3 {{$}} define void @subregs_for_super() #0 { call void asm sideeffect "", "~{v0},~{v1}"() #0 ret void } -; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}} +; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}} define void @clobbered_reg_with_sub() #0 { call void asm sideeffect "", "~{v[0:1]}"() #0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -25,7 +25,8 @@ } ; FUNC-LABEL: {{^}}constant_load_v3i64: -; GCN: s_load_dwordx8 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; GCN-DAG: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} +; GCN-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x{{[0-9]+}}{{$}} ; EG-DAG: VTX_READ_128 ; EG-DAG: VTX_READ_128 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll @@ -25,10 +25,10 @@ } ; FUNC-LABEL: {{^}}global_load_v3f64: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; GCN-NOHSA-DAG: buffer_load_dwordx4 +; GCN-NOHSA-DAG: 
buffer_load_dwordx2 +; GCN-HSA-DAG: flat_load_dwordx4 +; GCN-HSA-DAG: flat_load_dwordx2 define amdgpu_kernel void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { entry: %ld = load <3 x double>, <3 x double> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll @@ -32,11 +32,11 @@ } ; FUNC-LABEL: {{^}}global_load_v3i64: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA-DAG: buffer_load_dwordx4 +; GCN-NOHSA-DAG: buffer_load_dwordx2 -; GCN-HSA: flat_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; GCN-HSA-DAG: flat_load_dwordx4 +; GCN-HSA-DAG: flat_load_dwordx2 ; EG: VTX_READ_128 ; EG: VTX_READ_128 diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll --- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -66,38 +66,38 @@ ; CHECK-NEXT: s_mov_b32 s6, s4 ; CHECK-NEXT: s_mov_b32 s5, s3 ; CHECK-NEXT: s_mov_b32 s4, s2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v1 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, 8, v1 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, 12, v1 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 8, v1 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v1 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1 ; CHECK-NEXT: v_mov_b32_e32 v9, s0 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, 16, v2 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v2 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, 8, v2 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, 12, v2 ; CHECK-NEXT: s_mov_b32 m0, -1 ; CHECK-NEXT: ds_read_b32 v3, v1 -; CHECK-NEXT: ds_read_b32 v5, v4 -; CHECK-NEXT: ds_read_b32 v4, v7 -; CHECK-NEXT: ds_read_b32 v1, v8 +; CHECK-NEXT: ds_read_b32 v4, v0 +; CHECK-NEXT: ds_read_b32 v5, v5 ; CHECK-NEXT: ds_read_b32 v6, v6 -; CHECK-NEXT: ds_read_b32 v0, v0 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v2 +; CHECK-NEXT: ds_read_b32 v0, v7 +; CHECK-NEXT: ds_read_b32 v1, v8 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v2 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: s_waitcnt lgkmcnt(2) ; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc ; CHECK-NEXT: s_waitcnt expcnt(0) ; CHECK-NEXT: ds_read_b32 v0, v2 -; CHECK-NEXT: ds_read_b32 v2, v12 -; CHECK-NEXT: ds_read_b32 v1, v7 +; CHECK-NEXT: ds_read_b32 v1, v10 +; CHECK-NEXT: ds_read_b32 v2, v11 +; CHECK-NEXT: ds_read_b32 v3, v12 +; CHECK-NEXT: ds_read_b32 v4, v7 ; CHECK-NEXT: ds_read_b32 v5, v8 -; CHECK-NEXT: ds_read_b32 v3, v11 -; CHECK-NEXT: ds_read_b32 v4, v10 ; CHECK-NEXT: s_waitcnt lgkmcnt(5) ; CHECK-NEXT: exp mrt0 off, off, off, off -; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: s_waitcnt lgkmcnt(2) ; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 
format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -499,7 +499,7 @@ ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -529,7 +529,7 @@ ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -671,7 +671,7 @@ ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -701,7 +701,7 @@ ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -738,7 +738,7 @@ ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -768,7 +768,7 @@ ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -805,7 +805,7 @@ ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -835,7 +835,7 @@ ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -480,7 +480,7 @@ ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -512,7 +512,7 @@ ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -551,7 +551,7 @@ ; GCN-LABEL: 
s_test_srem24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -583,7 +583,7 @@ ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -676,7 +676,7 @@ ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -708,7 +708,7 @@ ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -747,7 +747,7 @@ ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -779,7 +779,7 @@ ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1297,37 +1297,37 @@ ; GFX9-LABEL: fma_shuffle: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: fma_shuffle: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] -; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[8:9] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] ; GFX10-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] ; GFX10-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm entry: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()