Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -465,11 +465,23 @@ if (!SrcRC) return false; - ArrayRef SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); + const unsigned RoundedDstSize = 32 * ((DstSize + 31) / 32); const DebugLoc &DL = I.getDebugLoc(); - MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) - .addReg(SrcReg, 0, SubRegs[Offset / DstSize]); + + MachineInstr *Copy; + + // If we are doing an extract where the result size rounds up to the source, + // this isn't a subregister copy (e.g. v3s16 from v4s16). + if (RoundedDstSize == SrcSize) { + Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) + .addReg(SrcReg); + } else { + ArrayRef SubRegs = TRI.getRegSplitParts(SrcRC, RoundedDstSize / 8); + + Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) + .addReg(SrcReg, 0, SubRegs[Offset / RoundedDstSize]); + } for (const MachineOperand &MO : Copy->operands()) { const TargetRegisterClass *RC = @@ -592,6 +604,8 @@ unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); unsigned InsSize = Src1Ty.getSizeInBits(); + if (InsSize % 32 != 0) + return false; int64_t Offset = I.getOperand(3).getImm(); if (Offset % 32 != 0) Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1782,6 +1782,7 @@ case 32: return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : &AMDGPU::SReg_32RegClass; + case 48: case 64: return RB.getID() == AMDGPU::VGPRRegBankID ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::SReg_64RegClass; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir @@ -177,3 +177,83 @@ S_ENDPGM 0, implicit %1, implicit %2 ... + +--- +name: extract_sgpr_s16_from_v4s16_offset0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: extract_sgpr_s16_from_v4s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; CHECK: S_ENDPGM 0, implicit [[COPY1]] + %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 + %1:sgpr(s16) = G_EXTRACT %0, 0 + S_ENDPGM 0, implicit %1 + +... + +--- +name: extract_sgpr_v2s16_from_v4s16_offset0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: extract_sgpr_v2s16_from_v4s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0 + ; CHECK: S_ENDPGM 0, implicit [[COPY1]] + %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 + %1:sgpr(<2 x s16>) = G_EXTRACT %0, 0 + S_ENDPGM 0, implicit %1 + +... + +--- +name: extract_sgpr_v3s16_from_v4s16_offset0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: extract_sgpr_v3s16_from_v4s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; CHECK: S_ENDPGM 0, implicit [[COPY]] + %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 + %1:sgpr(<3 x s16>) = G_EXTRACT %0, 0 + S_ENDPGM 0, implicit %1 + +... + +# FIXME: Re-enable the disabled <3 x s16>-source extract tests below once they are supported. +# --- +# name: extract_sgpr_s16_from_v3s16_offset0 +# legalized: true +# regBankSelected: true + +# body: | +# bb.0: +# %0:sgpr(<3 x s16>) = IMPLICIT_DEF +# %1:sgpr(s16) = G_EXTRACT %0, 0 +# S_ENDPGM 0, implicit %1 + +# ...
+ +# --- +# name: extract_sgpr_v2s16_from_v3s16_offset0 +# legalized: true +# regBankSelected: true + +# body: | +# bb.0: +# %0:sgpr(<3 x s16>) = IMPLICIT_DEF +# %1:sgpr(<2 x s16>) = G_EXTRACT %0, 0 +# S_ENDPGM 0, implicit %1 + +# ... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir @@ -560,3 +560,54 @@ %2:vgpr(s256) = G_INSERT %0, %1, 128 S_ENDPGM 0, implicit %2 ... + +# --- +# name: insert_sgpr_v3s16_to_v4s16_offset0 +# legalized: true +# regBankSelected: true + +# body: | +# bb.0: +# liveins: $sgpr0_sgpr1 +# %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 +# %1:sgpr(<3 x s16>) = IMPLICIT_DEF +# %2:sgpr(<4 x s16>) = G_INSERT %0, %1, 0 +# S_ENDPGM 0, implicit %2 + +# ... + +--- +name: insert_sgpr_v2s16_to_v4s16_offset0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2 + ; CHECK-LABEL: name: insert_sgpr_v2s16_to_v4s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 + %1:sgpr(<2 x s16>) = COPY $sgpr2 + %2:sgpr(<4 x s16>) = G_INSERT %0, %1, 0 + S_ENDPGM 0, implicit %2 + +... + +# --- +# name: insert_sgpr_s16_to_v4s16_offset0 +# legalized: true +# regBankSelected: true + +# body: | +# bb.0: +# liveins: $sgpr0_sgpr1, $sgpr2 +# %0:sgpr(<4 x s16>) = COPY $sgpr0_sgpr1 +# %1:sgpr(s32) = COPY $sgpr2 +# %2:sgpr(s16) = G_TRUNC %1 +# %3:sgpr(<4 x s16>) = G_INSERT %0, %2, 0 +# S_ENDPGM 0, implicit %3 + +# ...