Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -279,6 +279,10 @@ setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand); setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand); + setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand); + setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand); + setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand); setOperationAction(ISD::Constant, MVT::i32, Legal); setOperationAction(ISD::Constant, MVT::i64, Legal); Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -157,6 +157,9 @@ addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); + addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); @@ -168,10 +171,8 @@ addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } - if (Subtarget->hasMAIInsts()) { - addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); - } + addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -243,6 +244,8 @@ setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v16i32, 
Expand); + setOperationAction(ISD::FP_ROUND, MVT::v16f32, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); @@ -280,7 +283,7 @@ for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, - MVT::v32i32, MVT::v32f32 }) { + MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -352,6 +355,20 @@ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32); } + for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32); + } + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); @@ -3917,12 +3934,14 @@ case AMDGPU::SI_INDIRECT_SRC_V4: case AMDGPU::SI_INDIRECT_SRC_V8: case AMDGPU::SI_INDIRECT_SRC_V16: + case AMDGPU::SI_INDIRECT_SRC_V32: return emitIndirectSrc(MI, *BB, *getSubtarget()); case AMDGPU::SI_INDIRECT_DST_V1: case AMDGPU::SI_INDIRECT_DST_V2: case AMDGPU::SI_INDIRECT_DST_V4: case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V16: + case AMDGPU::SI_INDIRECT_DST_V32: return emitIndirectDst(MI, *BB, *getSubtarget()); case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: case AMDGPU::SI_KILL_I1_PSEUDO: Index: llvm/lib/Target/AMDGPU/SIInstructions.td 
=================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -565,12 +565,14 @@ def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC; +def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC; def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST; } // End Uses = [EXEC], Defs = [M0, EXEC] @@ -1192,8 +1194,8 @@ // 512-bit bitcast def : BitConvert ; def : BitConvert ; -def : BitConvert ; -def : BitConvert ; +def : BitConvert ; +def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; @@ -1206,6 +1208,17 @@ // 1024-bit bitcast def : BitConvert ; def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + /********** =================== **********/ /********** Src & Dst modifiers **********/ @@ -1581,11 +1594,13 @@ defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; +defm : SI_INDIRECT_Pattern ; //===----------------------------------------------------------------------===// // SAD Patterns Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -773,12 +773,12 @@ let isAllocatable = 0; } -def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, +def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, 
v16i64, v16f64], 32, (add SGPR_1024Regs)> { let AllocationPriority = 20; } -def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32, +def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32, (add SGPR_1024)> { let CopyCost = 16; let isAllocatable = 0; @@ -803,7 +803,7 @@ def VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>; def VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>; def VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>; -def VReg_1024 : VRegClass<32, [v32i32, v32f32], (add VGPR_1024)>; +def VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; class ARegClass regTypes, dag regList> : VRegClass { @@ -819,7 +819,7 @@ def AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>; def AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>; def AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>; -def AReg_1024 : ARegClass<32, [v32i32, v32f32], (add AGPR_1024)>; +def AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>; } // End GeneratePressureSet = 0 Index: llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -220,42 +220,31 @@ ret void } +; GCN-LABEL: {{^}}double15_extelt: +; GCN-NOT: buffer_ +; GCN-NOT: s_or_b32 +; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0 +; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]] +; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] +; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]] +; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]] +; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]] +define amdgpu_kernel void @double15_extelt(double addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <15 x double> , i32 %sel + store double %ext, double addrspace(1)* %out + ret void +} + ; 
GCN-LABEL: {{^}}double16_extelt: -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: store_dword +; GCN-NOT: buffer_ +; GCN-NOT: s_or_b32 +; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0 +; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]] +; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] +; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]] +; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]] +; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]] define amdgpu_kernel void @double16_extelt(double addrspace(1)* %out, i32 %sel) { entry: %ext = extractelement <16 x double> , i32 %sel @@ -263,6 +252,50 @@ ret void } +; GCN-LABEL: {{^}}float32_extelt: +; GCN-NOT: buffer_ +; GCN-DAG: s_mov_b32 m0, +; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000 +; GCN-DAG: v_mov_b32_e32 
v{{[0-9]+}}, 0x41100000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41880000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41980000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a80000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b80000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41c00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41c80000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41d00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41d80000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41e00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41e80000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41f00000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41f80000 +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x42000000 +; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]] +; GCN: store_dword v[{{[0-9:]+}}], [[RES]] +define amdgpu_kernel void @float32_extelt(float addrspace(1)* %out, i32 %sel) { +entry: + %ext = extractelement <32 x float> , i32 %sel + store float %ext, float addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}byte8_extelt: ; GCN-NOT: buffer_ ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x4030201 Index: llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -106,6 +106,15 @@ ret void } +; GCN-LABEL: {{^}}float32_inselt: +; GCN: v_movreld_b32 +define amdgpu_kernel void @float32_inselt(<32 x float> addrspace(1)* %out, <32 x float> %vec, i32 %sel) { 
+entry: + %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel + store <32 x float> %v, <32 x float> addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}half4_inselt: ; GCN-NOT: v_cndmask_b32 ; GCN-NOT: v_movrel @@ -298,6 +307,36 @@ ret void } +; GCN-LABEL: {{^}}double16_inselt: +; GCN-NOT: v_cndmask +; GCN-NOT: buffer_ +; GCN-NOT: s_or_b32 +; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] +; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0 +; GCN-NOT: s_mov_b32 m0 +; GCN: v_movreld_b32_e32 v[[#BASE+1]], +define amdgpu_kernel void @double16_inselt(<16 x double> addrspace(1)* %out, <16 x double> %vec, i32 %sel) { +entry: + %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel + store <16 x double> %v, <16 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}double15_inselt: +; GCN-NOT: v_cndmask +; GCN-NOT: buffer_ +; GCN-NOT: s_or_b32 +; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]] +; GCN-DAG: v_movreld_b32_e32 v[[#BASE:]], 0 +; GCN-NOT: s_mov_b32 m0 +; GCN: v_movreld_b32_e32 v[[#BASE+1]], +define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 x double> %vec, i32 %sel) { +entry: + %v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel + store <15 x double> %v, <15 x double> addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}bit4_inselt: ; GCN: buffer_store_byte ; GCN: buffer_load_ubyte Index: llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -246,7 +246,7 @@ ; GFX908-DAG: v_accvgpr_read_b32 ; GCN: NumVgprs: 256 -; GFX900: ScratchSize: 644 +; GFX900: ScratchSize: 2052 ; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256