Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -469,6 +469,7 @@ return true; } +// TODO: We should probably legalize these to only using 32-bit results. bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); Register DstReg = I.getOperand(0).getReg(); @@ -480,7 +481,12 @@ // TODO: Should handle any multiple of 32 offset. unsigned Offset = I.getOperand(2).getImm(); - if (Offset % DstSize != 0) + if (Offset % 32 != 0 || DstSize > 128) + return false; + + const TargetRegisterClass *DstRC = + TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI); + if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) return false; const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); @@ -488,20 +494,18 @@ TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI); if (!SrcRC) return false; + unsigned SubReg = AMDGPURegisterInfo::getSubRegFromChannel(Offset / 32, + DstSize / 32); + SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg); + if (!SrcRC) + return false; - ArrayRef SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); - + SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I, + *SrcRC, I.getOperand(1)); const DebugLoc &DL = I.getDebugLoc(); - MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) - .addReg(SrcReg, 0, SubRegs[Offset / DstSize]); + BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) + .addReg(SrcReg, 0, SubReg); - for (const MachineOperand &MO : Copy->operands()) { - const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, *MRI); - if (!RC) - continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); - } I.eraseFromParent(); return true; } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-extract.mir @@ -177,3 +177,45 @@ S_ENDPGM 0, implicit %1, implicit %2 ... + +--- +name: extract_sgpr_s96_from_s128 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-LABEL: name: extract_sgpr_s96_from_s128 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_128_with_sub1_sub2_sub3 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_128_with_sub0_sub1_sub2 = COPY [[COPY]] + ; CHECK: [[COPY2:%[0-9]+]]:sreg_96 = COPY [[COPY1]].sub0_sub1_sub2 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_96 = COPY [[COPY]].sub1_sub2_sub3 + ; CHECK: S_ENDPGM 0, implicit [[COPY2]], implicit [[COPY3]] + %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr(s96) = G_EXTRACT %0, 0 + %2:sgpr(s96) = G_EXTRACT %0, 32 + S_ENDPGM 0, implicit %1, implicit %2 + +... + +--- +name: extract_sgpr_v3s32_from_v4s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-LABEL: name: extract_sgpr_v3s32_from_v4s32 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_128_with_sub1_sub2_sub3 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_128_with_sub0_sub1_sub2 = COPY [[COPY]] + ; CHECK: [[COPY2:%[0-9]+]]:sreg_96 = COPY [[COPY1]].sub0_sub1_sub2 + ; CHECK: [[COPY3:%[0-9]+]]:sreg_96 = COPY [[COPY]].sub1_sub2_sub3 + ; CHECK: S_ENDPGM 0, implicit [[COPY2]], implicit [[COPY3]] + %0:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr(<3 x s32>) = G_EXTRACT %0, 0 + %2:sgpr(<3 x s32>) = G_EXTRACT %0, 32 + S_ENDPGM 0, implicit %1, implicit %2 + +...