Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1292,9 +1292,6 @@
   Register SrcReg = I.getOperand(1).getReg();
   const LLT DstTy = MRI->getType(DstReg);
   const LLT SrcTy = MRI->getType(SrcReg);
-  if (!DstTy.isScalar())
-    return false;
-
   const LLT S1 = LLT::scalar(1);
 
   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -1309,6 +1306,8 @@
       return false;
   }
 
+  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+
   unsigned DstSize = DstTy.getSizeInBits();
   unsigned SrcSize = SrcTy.getSizeInBits();
 
@@ -1317,6 +1316,71 @@
   const TargetRegisterClass *DstRC
     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
 
+  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
+      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
+    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+    return false;
+  }
+
+  if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
+    MachineBasicBlock *MBB = I.getParent();
+    const DebugLoc &DL = I.getDebugLoc();
+
+    Register LoReg = MRI->createVirtualRegister(DstRC);
+    Register HiReg = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
+      .addReg(SrcReg, 0, AMDGPU::sub0);
+    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
+      .addReg(SrcReg, 0, AMDGPU::sub1);
+
+    if (IsVALU && STI.hasSDWA()) {
+      // Write the low 16-bits of the high element into the high 16-bits of the
+      // low element.
+      MachineInstr *MovSDWA =
+        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+        .addImm(0)                             // $src0_modifiers
+        .addReg(HiReg)                         // $src0
+        .addImm(0)                             // $clamp
+        .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
+        .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+        .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
+        .addReg(LoReg, RegState::Implicit);
+      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+    } else {
+      Register TmpReg0 = MRI->createVirtualRegister(DstRC);
+      Register TmpReg1 = MRI->createVirtualRegister(DstRC);
+      Register ImmReg = MRI->createVirtualRegister(DstRC);
+      if (IsVALU) {
+        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
+          .addImm(16)
+          .addReg(HiReg);
+      } else {
+        BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
+          .addReg(HiReg)
+          .addImm(16);
+      }
+
+      unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+      unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
+      unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
+
+      BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
+        .addImm(0xffff);
+      BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
+        .addReg(LoReg)
+        .addReg(ImmReg);
+      BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
+        .addReg(TmpReg0)
+        .addReg(TmpReg1);
+    }
+
+    I.eraseFromParent();
+    return true;
+  }
+
+  if (!DstTy.isScalar())
+    return false;
+
   if (SrcSize > 32) {
     int SubRegIdx = sizeToSubRegIndex(DstSize);
     if (SubRegIdx == -1)
@@ -1324,17 +1388,17 @@
 
     // Deal with weird cases where the class only partially supports the subreg
    // index.
-    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
-    if (!SrcRC)
+    const TargetRegisterClass *SrcWithSubRC
+      = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
+    if (!SrcWithSubRC)
       return false;
 
-    I.getOperand(1).setSubReg(SubRegIdx);
-  }
+    if (SrcWithSubRC != SrcRC) {
+      if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
+        return false;
+    }
 
-  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
-      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
-    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
-    return false;
+    I.getOperand(1).setSubReg(SubRegIdx);
   }
 
   I.setDesc(TII.get(TargetOpcode::COPY));
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.v2s16.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s
+
+---
+
+name: trunc_sgpr_v2s32_to_v2s16
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; GFX6-LABEL: name: trunc_sgpr_v2s32_to_v2s16
+    ; GFX6: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GFX6: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def $scc
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+    ; GFX6: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+    ; GFX6: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LSHL_B32_]], [[S_AND_B32_]], implicit-def $scc
+    ; GFX6: S_ENDPGM 0, implicit [[S_OR_B32_]]
+    ; GFX8-LABEL: name: trunc_sgpr_v2s32_to_v2s16
+    ; GFX8: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
+    ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
+    ; GFX8: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def $scc
+    ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+    ; GFX8: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc
+    ; GFX8: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LSHL_B32_]], [[S_AND_B32_]], implicit-def $scc
+    ; GFX8: S_ENDPGM 0, implicit [[S_OR_B32_]]
+    %0:sgpr(<2 x s32>) = COPY $sgpr0_sgpr1
+    %1:sgpr(<2 x s16>) = G_TRUNC %0
+    S_ENDPGM 0, implicit %1
+...
+
+---
+
+name: trunc_vgpr_v2s32_to_v2s16
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; GFX6-LABEL: name: trunc_vgpr_v2s32_to_v2s16
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY2]], implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+    ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
+    ; GFX6: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_LSHLREV_B32_e64_]], [[V_AND_B32_e64_]], implicit $exec
+    ; GFX6: S_ENDPGM 0, implicit [[V_OR_B32_e64_]]
+    ; GFX8-LABEL: name: trunc_vgpr_v2s32_to_v2s16
+    ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX8: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[COPY2]], 0, 5, 2, 4, implicit $exec, implicit [[COPY1]](tied-def 0)
+    ; GFX8: S_ENDPGM 0, implicit [[V_MOV_B32_sdwa]]
+    %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:vgpr(<2 x s16>) = G_TRUNC %0
+    S_ENDPGM 0, implicit %1
+...
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
+
+define i16 @v_trunc_i32_to_i16(i32 %src) {
+; GFX7-LABEL: v_trunc_i32_to_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_trunc_i32_to_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+  %trunc = trunc i32 %src to i16
+  ret i16 %trunc
+}
+
+define amdgpu_ps i16 @s_trunc_i32_to_i16(i32 inreg %src) {
+; GFX7-LABEL: s_trunc_i32_to_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_trunc_i32_to_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    ; return to shader part epilog
+  %trunc = trunc i32 %src to i16
+  ret i16 %trunc
+}
+
+define i16 @v_trunc_i64_to_i16(i64 %src) {
+; GFX7-LABEL: v_trunc_i64_to_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_trunc_i64_to_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+  %trunc = trunc i64 %src to i16
+  ret i16 %trunc
+}
+
+define amdgpu_ps i16 @s_trunc_i64_to_i16(i64 inreg %src) {
+; GFX7-LABEL: s_trunc_i64_to_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_trunc_i64_to_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    ; return to shader part epilog
+  %trunc = trunc i64 %src to i16
+  ret i16 %trunc
+}
+
+define amdgpu_ps i16 @s_trunc_i128_to_i16(i128 inreg %src) {
+; GFX7-LABEL: s_trunc_i128_to_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_trunc_i128_to_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    ; return to shader part epilog
+  %trunc = trunc i128 %src to i16
+  ret i16 %trunc
+}
+
+define i16 @v_trunc_i128_to_i16(i128 %src) {
+; GFX7-LABEL: v_trunc_i128_to_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_trunc_i128_to_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+  %trunc = trunc i128 %src to i16
+  ret i16 %trunc
+}
+
+define i32 @v_trunc_v2i32_to_v2i16(<2 x i32> %src) {
+; GFX7-LABEL: v_trunc_v2i32_to_v2i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_trunc_v2i32_to_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+  %trunc = trunc <2 x i32> %src to <2 x i16>
+  %cast = bitcast <2 x i16> %trunc to i32
+  ret i32 %cast
+}
+
+define amdgpu_ps i32 @s_trunc_v2i32_to_v2i16(<2 x i32> inreg %src) {
+; GFX7-LABEL: s_trunc_v2i32_to_v2i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX7-NEXT:    s_or_b32 s0, s1, s0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_trunc_v2i32_to_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    ; return to shader part epilog
+  %trunc = trunc <2 x i32> %src to <2 x i16>
+  %cast = bitcast <2 x i16> %trunc to i32
+  ret i32 %cast
+}