diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1289,6 +1289,8 @@
     return &AMDGPU::VReg_128RegClass;
   case 160:
     return &AMDGPU::VReg_160RegClass;
+  case 192:
+    return &AMDGPU::VReg_192RegClass;
   case 256:
     return &AMDGPU::VReg_256RegClass;
   case 512:
@@ -1331,6 +1333,8 @@
     return &AMDGPU::SReg_128RegClass;
   case 160:
     return &AMDGPU::SReg_160RegClass;
+  case 192:
+    return &AMDGPU::SReg_192RegClass;
   case 256:
     return &AMDGPU::SReg_256RegClass;
   case 512:
@@ -1362,6 +1366,8 @@
     &AMDGPU::AReg_128RegClass,
     &AMDGPU::VReg_160RegClass,
     &AMDGPU::SReg_160RegClass,
+    &AMDGPU::VReg_192RegClass,
+    &AMDGPU::SReg_192RegClass,
     &AMDGPU::VReg_256RegClass,
     &AMDGPU::SReg_256RegClass,
     &AMDGPU::VReg_512RegClass,
@@ -1535,6 +1541,11 @@
     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
   };
 
+  static const int16_t Sub0_5[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+    AMDGPU::sub4, AMDGPU::sub5,
+  };
+
   static const int16_t Sub0_4[] = {
     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
   };
@@ -1562,6 +1573,8 @@
     return makeArrayRef(Sub0_3);
   case 160:
     return makeArrayRef(Sub0_4);
+  case 192:
+    return makeArrayRef(Sub0_5);
   case 256:
     return makeArrayRef(Sub0_7);
   case 512:
@@ -1597,6 +1610,9 @@
     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
   };
 
+  static const int16_t Sub0_5_64[] = {
+    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, AMDGPU::sub4_sub5
+  };
 
   static const int16_t Sub0_3_64[] = {
     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
@@ -1607,6 +1623,8 @@
     return {};
   case 128:
     return makeArrayRef(Sub0_3_64);
+  case 192:
+    return makeArrayRef(Sub0_5_64);
   case 256:
     return makeArrayRef(Sub0_7_64);
   case 512:
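The SIRegisterInfo.cpp hunks above all extend the same shape of code: a switch mapping a size in bits to the matching register class, or to the list of 32-bit or 64-bit subregister indices used to split a register of that size, with 192 slotted between the existing 160 and 256 cases. A minimal standalone sketch of that mapping, using a hypothetical enum in place of the TableGen-generated TargetRegisterClass pointers (the enum and function name here are illustrative, not from the patch):

#include <cassert>

// Hypothetical stand-ins for the generated register classes; the real code
// returns pointers such as &AMDGPU::VReg_192RegClass.
enum class VRegClass { V32, V64, V96, V128, V160, V192, V256, V512, V1024 };

// Same shape as the switches being patched: one case per supported width,
// with 192 as the newly handled size.
VRegClass vregClassForBitWidth(unsigned BitWidth) {
  switch (BitWidth) {
  case 32:   return VRegClass::V32;
  case 64:   return VRegClass::V64;
  case 96:   return VRegClass::V96;
  case 128:  return VRegClass::V128;
  case 160:  return VRegClass::V160;
  case 192:  return VRegClass::V192; // added by this patch
  case 256:  return VRegClass::V256;
  case 512:  return VRegClass::V512;
  case 1024: return VRegClass::V1024;
  default:
    assert(false && "unexpected register width");
    return VRegClass::V32;
  }
}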
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -59,6 +59,7 @@
   list<SubRegIndex> ret3 = [sub0, sub1, sub2];
   list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
   list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4];
+  list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5];
   list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
   list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
                              sub4, sub5, sub6, sub7,
@@ -77,8 +78,10 @@
                !if(!eq(size, 3), ret3,
                    !if(!eq(size, 4), ret4,
                        !if(!eq(size, 5), ret5,
-                           !if(!eq(size, 8), ret8,
-                               !if(!eq(size, 16), ret16, ret32))))));
+                           !if(!eq(size, 6), ret6,
+                               !if(!eq(size, 8), ret8,
+                                   !if(!eq(size, 16), ret16,
+                                       ret32)))))));
 }
 
 // Generates list of sequential register tuple names.
@@ -338,6 +341,9 @@
 // SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs.
 def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">;
 
+// SGPR 192-bit registers
+def SGPR_192Regs : SIRegisterTuples<getSubRegs<6>.ret, SGPR_32, 105, 4, 6, "s">;
+
 // SGPR 256-bit registers
 def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
@@ -484,6 +490,9 @@
 // VGPR 160-bit registers
 def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;
 
+// VGPR 192-bit registers
+def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">;
+
 // VGPR 256-bit registers
 def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
@@ -655,10 +664,15 @@
   let AllocationPriority = 16;
 }
 
-def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64], 32, (add SGPR_256Regs)> {
+def SReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192Regs)> {
+  let Size = 192;
   let AllocationPriority = 17;
 }
 
+def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64], 32, (add SGPR_256Regs)> {
+  let AllocationPriority = 18;
+}
+
 def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
   let isAllocatable = 0;
 }
@@ -667,12 +681,12 @@
                                  (add SGPR_256, TTMP_256)> {
   // Requires 4 s_mov_b64 to copy
   let CopyCost = 4;
-  let AllocationPriority = 17;
+  let AllocationPriority = 18;
 }
 
 def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
                              (add SGPR_512Regs)> {
-  let AllocationPriority = 18;
+  let AllocationPriority = 19;
 }
 
 def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
@@ -684,7 +698,7 @@
                                  (add SGPR_512, TTMP_512)> {
   // Requires 8 s_mov_b64 to copy
   let CopyCost = 8;
-  let AllocationPriority = 18;
+  let AllocationPriority = 19;
 }
 
 def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
@@ -694,13 +708,13 @@
 
 def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
                               (add SGPR_1024Regs)> {
-  let AllocationPriority = 19;
+  let AllocationPriority = 20;
 }
 
 def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
                               (add SGPR_1024)> {
   let CopyCost = 16;
-  let AllocationPriority = 19;
+  let AllocationPriority = 20;
 }
 
 // Register class for all vector registers (VGPRs + Interploation Registers)
@@ -743,11 +757,18 @@
   let Weight = 5;
 }
 
+def VReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_192)> {
+  let Size = 192;
+  let CopyCost = 6;
+  let AllocationPriority = 6;
+  let Weight = 6;
+}
+
 def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
   let Size = 256;
   let CopyCost = 8;
-  let AllocationPriority = 6;
+  let AllocationPriority = 7;
   let Weight = 8;
 }
@@ -755,7 +776,7 @@
                            (add VGPR_512)> {
   let Size = 512;
   let CopyCost = 16;
-  let AllocationPriority = 7;
+  let AllocationPriority = 8;
   let Weight = 16;
 }
@@ -763,7 +784,7 @@
                             (add VGPR_1024)> {
   let Size = 1024;
   let CopyCost = 32;
-  let AllocationPriority = 8;
+  let AllocationPriority = 9;
   let Weight = 32;
 }
@@ -788,14 +809,14 @@
                            (add AGPR_512)> {
   let Size = 512;
   let CopyCost = 33;
-  let AllocationPriority = 7;
+  let AllocationPriority = 8;
 }
 
 def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
                               (add AGPR_1024)> {
   let Size = 1024;
   let CopyCost = 65;
-  let AllocationPriority = 8;
+  let AllocationPriority = 9;
 }
 
 } // End GeneratePressureSet = 0
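Two notes on the TableGen hunks. First, the new classes take AllocationPriority slots 17 (SReg_192) and 6 (VReg_192), so every wider class after them shifts up by one; that accounts for all of the +17/+18-style churn in this file. Second, the SIRegisterTuples defs enumerate every 192-bit tuple over the base registers: SGPR tuples start at every 4th register up to s105, VGPR tuples at every register up to v255. A sketch of that enumeration, under the assumed parameter meaning (last base-register index, stride between tuple starts, tuple width, name prefix); this is not the TableGen emitter itself, just an illustration:

#include <cstdio>

// Prints the tuples the SGPR_192Regs/VGPR_192 defs above describe.
static void printTuples(const char *Prefix, unsigned LastIdx, unsigned Stride,
                        unsigned Width) {
  for (unsigned Start = 0; Start + Width - 1 <= LastIdx; Start += Stride)
    std::printf("%s[%u:%u]\n", Prefix, Start, Start + Width - 1);
}

int main() {
  printTuples("s", 105, 4, 6); // s[0:5], s[4:9], ..., s[100:105]
  printTuples("v", 255, 1, 6); // v[0:5], v[1:6], ..., v[250:255]
  return 0;
}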
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1114,6 +1114,9 @@
   case AMDGPU::SReg_160RegClassID:
   case AMDGPU::VReg_160RegClassID:
     return 160;
+  case AMDGPU::SReg_192RegClassID:
+  case AMDGPU::VReg_192RegClassID:
+    return 192;
   case AMDGPU::SReg_256RegClassID:
   case AMDGPU::VReg_256RegClassID:
     return 256;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte-xfail.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte-xfail.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte-xfail.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: not --crash llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s
-; RUN: not --crash llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s
-
-define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
-  %trunc = trunc i32 %arg0 to i24
-  %val = bitcast i24 %trunc to <3 x i8>
-  %cvt = uitofp <3 x i8> %val to <3 x float>
-  ret <3 x float> %cvt
-}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -191,6 +191,40 @@
   ret <2 x float> %cvt
 }
 
+define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
+; SI-LABEL: v_uitofp_v3i8_to_v3f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SI-NEXT:    s_movk_i32 s4, 0xff
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; SI-NEXT:    v_and_b32_e32 v0, s4, v0
+; SI-NEXT:    v_and_b32_e32 v1, s4, v1
+; SI-NEXT:    v_and_b32_e32 v2, s4, v2
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_uitofp_v3i8_to_v3f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_movk_i32 s4, 0xff
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_and_b32_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
+; VI-NEXT:    v_mov_b32_e32 v0, v3
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %trunc = trunc i32 %arg0 to i24
+  %val = bitcast i24 %trunc to <3 x i8>
+  %cvt = uitofp <3 x i8> %val to <3 x float>
+  ret <3 x float> %cvt
+}
+
 define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_v4i8_to_v4f32:
 ; SI:       ; %bb.0:
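The deleted -xfail.ll file and the new checked-in test cover the same function: this case previously crashed the GlobalISel pipeline (hence the not --crash RUN lines) and now selects cleanly. As a reference for what the generated SI sequence computes (the function name below is illustrative, not from the patch): treat the low 24 bits of the argument as three unsigned bytes and convert each to float.

#include <array>
#include <cstdint>

// Reference semantics for v_uitofp_v3i8_to_v3f32: unpack three bytes from
// the low 24 bits and convert each to float, which is what the
// v_and_b32/v_lshrrev_b32/v_cvt_f32_ubyte0 sequence implements.
std::array<float, 3> uitofpV3I8(uint32_t Arg) {
  return {static_cast<float>(Arg & 0xffu),
          static_cast<float>((Arg >> 8) & 0xffu),
          static_cast<float>((Arg >> 16) & 0xffu)};
}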
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-concat-vectors.mir
@@ -3,9 +3,7 @@
 # RUN: FileCheck -check-prefix=ERR %s < %t
 
 # ERR-NOT: remark:
-# ERR: remark: <unknown>:0:0: cannot select: %3:sgpr(<12 x s16>) = G_CONCAT_VECTORS %0:sgpr(<4 x s16>), %1:sgpr(<4 x s16>), %2:sgpr(<4 x s16>) (in function: test_concat_vectors_s_v12s16_s_v4s16_s_v4s16_s_v4s16)
-# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:vgpr(<12 x s16>) = G_CONCAT_VECTORS %0:vgpr(<4 x s16>), %1:vgpr(<4 x s16>), %2:vgpr(<4 x s16>) (in function: test_concat_vectors_v_v12s16_v_v4s16_v_v4s16_v_v4s16)
-# ERR-NEXT: remark: <unknown>:0:0: cannot select: %2:sgpr(<6 x s64>) = G_CONCAT_VECTORS %0:sgpr(<3 x s64>), %1:sgpr(<3 x s64>) (in function: test_concat_vectors_s_v6s64_s_v3s64_s_v3s64)
+# ERR: remark: <unknown>:0:0: cannot select: %2:sgpr(<6 x s64>) = G_CONCAT_VECTORS %0:sgpr(<3 x s64>), %1:sgpr(<3 x s64>) (in function: test_concat_vectors_s_v6s64_s_v3s64_s_v3s64)
 # ERR-NOT: remark:
 
 ---
@@ -282,11 +280,11 @@
     liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
 
     ; GCN-LABEL: name: test_concat_vectors_s_v12s16_s_v4s16_s_v4s16_s_v4s16
-    ; GCN: [[COPY:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1
-    ; GCN: [[COPY1:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr2_sgpr3
-    ; GCN: [[COPY2:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr4_sgpr5
-    ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<12 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>), [[COPY2]](<4 x s16>)
-    ; GCN: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>)
+    ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3
+    ; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
+    ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_192 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1, [[COPY1]], %subreg.sub2_sub3, [[COPY2]], %subreg.sub4_sub5
+    ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
     %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1
     %1:sgpr(<4 x s16>) = COPY $sgpr2_sgpr3
     %2:sgpr(<4 x s16>) = COPY $sgpr4_sgpr5
@@ -304,11 +302,11 @@
     liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
 
    ; GCN-LABEL: name: test_concat_vectors_v_v12s16_v_v4s16_v_v4s16_v_v4s16
-    ; GCN: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1
-    ; GCN: [[COPY1:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3
-    ; GCN: [[COPY2:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr4_vgpr5
-    ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<12 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>), [[COPY2]](<4 x s16>)
-    ; GCN: S_ENDPGM 0, implicit [[CONCAT_VECTORS]](<12 x s16>)
+    ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GCN: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GCN: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1, [[COPY1]], %subreg.sub2_sub3, [[COPY2]], %subreg.sub4_sub5
+    ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
    %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1
    %1:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3
    %2:vgpr(<4 x s16>) = COPY $vgpr4_vgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-merge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-merge-values.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-merge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-merge-values.mir
@@ -1,12 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' -o - %s 2> %t | FileCheck -check-prefix=GCN %s
-# RUN: FileCheck -check-prefix=ERR %s < %t
-
-
-# ERR-NOT: remark:
-# ERR: remark: <unknown>:0:0: cannot select: %3:sgpr(s192) = G_MERGE_VALUES %0:sgpr(s64), %1:sgpr(s64), %2:sgpr(s64) (in function: test_merge_values_s_s192_s_s64_s_s64_s_s64)
-# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:vgpr(s192) = G_MERGE_VALUES %0:vgpr(s64), %1:vgpr(s64), %2:vgpr(s64) (in function: test_merge_values_v_s192_v_s64_v_s64_v_s64)
-# ERR-NOT: remark:
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=2 -pass-remarks-missed='gisel*' -o - %s | FileCheck -check-prefix=GCN %s
 
 ---
 name: test_merge_values_v_s64_v_s32_v_s32
@@ -346,11 +339,11 @@
 
     ; GCN-LABEL: name: test_merge_values_s_s192_s_s64_s_s64_s_s64
     ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
-    ; GCN: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
-    ; GCN: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3
-    ; GCN: [[COPY2:%[0-9]+]]:sgpr(s64) = COPY $sgpr4_sgpr5
-    ; GCN: [[MV:%[0-9]+]]:sgpr(s192) = G_MERGE_VALUES [[COPY]](s64), [[COPY1]](s64), [[COPY2]](s64)
-    ; GCN: S_ENDPGM 0, implicit [[MV]](s192)
+    ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3
+    ; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5
+    ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_192 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1, [[COPY1]], %subreg.sub2_sub3, [[COPY2]], %subreg.sub4_sub5
+    ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
     %0:sgpr(s64) = COPY $sgpr0_sgpr1
     %1:sgpr(s64) = COPY $sgpr2_sgpr3
     %2:sgpr(s64) = COPY $sgpr4_sgpr5
@@ -370,11 +363,11 @@
 
     ; GCN-LABEL: name: test_merge_values_v_s192_v_s64_v_s64_v_s64
     ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5
-    ; GCN: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
-    ; GCN: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
-    ; GCN: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY $vgpr4_vgpr5
-    ; GCN: [[MV:%[0-9]+]]:vgpr(s192) = G_MERGE_VALUES [[COPY]](s64), [[COPY1]](s64), [[COPY2]](s64)
-    ; GCN: S_ENDPGM 0, implicit [[MV]](s192)
+    ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GCN: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GCN: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1, [[COPY1]], %subreg.sub2_sub3, [[COPY2]], %subreg.sub4_sub5
+    ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = COPY $vgpr2_vgpr3
     %2:vgpr(s64) = COPY $vgpr4_vgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-unmerge-values.mir
@@ -1,10 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -o - %s 2> %t | FileCheck -check-prefix=GCN %s
-# RUN: FileCheck -check-prefix=ERR %s < %t
-
-# ERR-NOT: remark:
-# ERR: remark: <unknown>:0:0: cannot select: %1:sgpr(s64), %2:sgpr(s64), %3:sgpr(s64) = G_UNMERGE_VALUES %0:sgpr(s192) (in function: test_unmerge_values_s_s64_s_s64_s64_s_s192)
-# ERR-NOT: remark:
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -o - %s | FileCheck -check-prefix=GCN %s
 
 ---
 name: test_unmerge_values_v_s32_v_s32_v_s64
@@ -185,9 +180,11 @@
 
     ; GCN-LABEL: name: test_unmerge_values_s_s64_s_s64_s64_s_s192
     ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GCN: [[DEF:%[0-9]+]]:sgpr(s192) = G_IMPLICIT_DEF
-    ; GCN: [[UV:%[0-9]+]]:sgpr(s64), [[UV1:%[0-9]+]]:sgpr(s64), [[UV2:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[DEF]](s192)
-    ; GCN: S_ENDPGM 0, implicit [[UV]](s64), implicit [[UV1]](s64), implicit [[UV2]](s64)
+    ; GCN: [[DEF:%[0-9]+]]:sreg_192 = IMPLICIT_DEF
+    ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY [[DEF]].sub0_sub1
+    ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[DEF]].sub2_sub3
+    ; GCN: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[DEF]].sub4_sub5
+    ; GCN: S_ENDPGM 0, implicit [[COPY]], implicit [[COPY1]], implicit [[COPY2]]
     %0:sgpr(s192) = G_IMPLICIT_DEF
     %1:sgpr(s64), %2:sgpr(s64), %3:sgpr(s64) = G_UNMERGE_VALUES %0
     S_ENDPGM 0, implicit %1, implicit %2, implicit %3
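With SReg_192/VReg_192 available, G_MERGE_VALUES and G_UNMERGE_VALUES of s192 now select to a REG_SEQUENCE and subregister COPYs instead of emitting cannot-select remarks, so the ERR RUN lines above are gone. The value model behind those checks is simple; a sketch with illustrative struct and helper names (not from the patch):

#include <cstdint>

// A 192-bit value as three 64-bit lanes, mirroring the sub0_sub1, sub2_sub3
// and sub4_sub5 subregister pairs used by the REG_SEQUENCE checks above.
struct S192 {
  uint64_t Sub01, Sub23, Sub45;
};

// G_MERGE_VALUES direction: three s64 pieces build one s192 value.
S192 merge(uint64_t Lo, uint64_t Mid, uint64_t Hi) { return {Lo, Mid, Hi}; }

// G_UNMERGE_VALUES direction: the s192 value decomposes back into pieces.
void unmerge(const S192 &V, uint64_t &Lo, uint64_t &Mid, uint64_t &Hi) {
  Lo = V.Sub01;
  Mid = V.Sub23;
  Hi = V.Sub45;
}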
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload-xfail.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload-xfail.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload-xfail.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s
-; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s
-; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s
-
-define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) {
-  %load = load i32, i32 addrspace(1)* %ptr
-  %ext = zext i32 %load to i96
-  ret i96 %ext
-}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
@@ -134,6 +134,50 @@
   ret i64 %ext
 }
 
+define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) {
+; GFX9-LABEL: zextload_global_i32_to_i96:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    global_load_dword v0, v[2:3], off
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: zextload_global_i32_to_i96:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v0
+; GFX8-NEXT:    flat_load_dword v0, v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: zextload_global_i32_to_i96:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v3, v1
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], 0
+; GFX6-NEXT:    buffer_load_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+  %load = load i32, i32 addrspace(1)* %ptr
+  %ext = zext i32 %load to i96
+  ret i96 %ext
+}
+
 define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) {
 ; GFX9-LABEL: zextload_global_i32_to_i128:
 ; GFX9:       ; %bb.0:
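The new i96 case follows the pattern of the surrounding i64/i128 tests: the loaded dword lands in the low word of the result and the two upper words are zeroed, which is what the three v_mov_b32_e32 vN, 0 instructions in the checks do. A reference model of the scalar semantics, with illustrative names:

#include <cstdint>

// zext i32 -> i96 viewed as three 32-bit words: loaded value in the low
// word, zeros above, matching the v_mov_b32_e32 vN, 0 instructions.
struct U96 {
  uint32_t Words[3];
};

U96 zext32To96(uint32_t Loaded) { return {{Loaded, 0u, 0u}}; }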
diff --git a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
--- a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
@@ -1,19 +1,19 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
 ; Make sure the expected regmask is generated for sub/superregisters.
-; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
+; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
 define void @csr() #0 {
   call void asm sideeffect "", "~{v0},~{v36},~{v37}"() #0
   ret void
 }
 
-; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
+; CHECK-DAG: subregs_for_super Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
 define void @subregs_for_super() #0 {
   call void asm sideeffect "", "~{v0},~{v1}"() #0
   ret void
 }
 
-; CHECK-DAG: Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
+; CHECK-DAG: clobbered_reg_with_sub Clobbered Registers: $vgpr0 $vgpr1 $vgpr0_hi16 $vgpr1_hi16 $vgpr0_lo16 $vgpr1_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 $vgpr0_vgpr1 $vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 $vgpr1_vgpr2_vgpr3 {{$}}
 define void @clobbered_reg_with_sub() #0 {
   call void asm sideeffect "", "~{v[0:1]}"() #0
   ret void