Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -679,6 +679,74 @@
     return;
   }
 
+  if (RC == &AMDGPU::VGPR_LO16RegClass || RC == &AMDGPU::VGPR_HI16RegClass) {
+    assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
+           AMDGPU::VGPR_HI16RegClass.contains(SrcReg));
+
+    //                 d                      s
+    // l -> l : hhhhxxxx : xxxxllll -> v_alignbyte_b32 d, s, d, 2
+    //          llllhhhh : xxxxllll -> v_alignbyte_b32 d, d, d, 2
+    // l -> h : xxxxllll : xxxxhhhh -> v_lshlrev_b32 d, 16, d
+    //          llll0000 : xxxxhhhh -> v_alignbyte_b32 d, s, d, 2
+    // h -> l : hhhhxxxx : llllxxxx -> v_lshrrev_b32 d, 16, d
+    //          0000hhhh : llllxxxx -> v_alignbyte_b32 d, d, s, 2
+    // h -> h : xxxxllll : hhhhxxxx -> v_alignbyte_b32 d, d, s, 2
+    //          llllhhhh : hhhhxxxx -> v_alignbyte_b32 d, d, d, 2
+
+    bool DstLow = RC == &AMDGPU::VGPR_LO16RegClass;
+    bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg);
+    DestReg = RI.getMatchingSuperReg(DestReg,
+                                     DstLow ? AMDGPU::lo16 : AMDGPU::hi16,
+                                     &AMDGPU::VGPR_32RegClass);
+    SrcReg = RI.getMatchingSuperReg(SrcReg,
+                                    SrcLow ? AMDGPU::lo16 : AMDGPU::hi16,
+                                    &AMDGPU::VGPR_32RegClass);
+
+    if (DestReg == SrcReg) {
+      // l -> h : v_pk_add_u16 v1, v1, 0 op_sel_hi:[0,0]
+      // h -> l : v_pk_add_u16 v1, v1, 0 op_sel:[1,0] op_sel_hi:[1,0]
+      if (DstLow == SrcLow)
+        return;
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_ADD_U16), DestReg)
+        .addImm(DstLow ? SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1 : 0)
+        .addReg(DestReg, RegState::Undef)
+        .addImm(0) // src1_mod
+        .addImm(0) // src1
+        .addImm(0)
+        .addImm(0)
+        .addImm(0)
+        .addImm(0)
+        .addImm(0);
+
+      return;
+    }
+
+    // Last instruction first:
+    auto Last = BuildMI(MBB, MI, DL, get(AMDGPU::V_ALIGNBYTE_B32), DestReg)
+                  .addReg((SrcLow && !DstLow) ? SrcReg : DestReg,
+                          (SrcLow && !DstLow) ? getKillRegState(KillSrc) : 0)
+                  .addReg((!SrcLow && DstLow) ? SrcReg : DestReg,
+                          (!SrcLow && DstLow) ? getKillRegState(KillSrc) : 0)
+                  .addImm(2);
+
+    unsigned OpcFirst = (DstLow == SrcLow) ? AMDGPU::V_ALIGNBYTE_B32
+                                           : SrcLow ? AMDGPU::V_LSHLREV_B32_e32
+                                                    : AMDGPU::V_LSHRREV_B32_e32;
+    auto First = BuildMI(MBB, &*Last, DL, get(OpcFirst), DestReg);
+    if (DstLow == SrcLow) { // alignbyte
+      First.addReg(SrcLow ? SrcReg : DestReg,
+                   SrcLow ? getKillRegState(KillSrc) : RegState::Undef)
+           .addReg(SrcLow ? DestReg : SrcReg,
+                   SrcLow ? RegState::Undef : getKillRegState(KillSrc))
+           .addImm(2);
+    } else {
+      First.addImm(16)
+           .addReg(DestReg, RegState::Undef);
+    }
+
+    return;
+  }
+
   unsigned EltSize = 4;
   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
   if (RI.isSGPRClass(RC)) {
Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1305,6 +1305,8 @@
   assert(!Register::isVirtualRegister(Reg));
 
   static const TargetRegisterClass *const BaseClasses[] = {
+    &AMDGPU::VGPR_LO16RegClass,
+    &AMDGPU::VGPR_HI16RegClass,
     &AMDGPU::VGPR_32RegClass,
     &AMDGPU::SReg_32RegClass,
     &AMDGPU::AGPR_32RegClass,
@@ -1344,6 +1346,9 @@
 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
   unsigned Size = getRegSizeInBits(*RC);
   switch (Size) {
+  case 16:
+    return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
+           getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
   case 32:
     return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
   case 64:
Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -259,18 +259,10 @@
 
 // VGPR registers
 foreach Index = 0-255 in {
-  // There is no special encoding for low 16 bit subreg, this not a real
-  // register but rather an operand for instructions preserving high 16 bits
-  // of the result or reading just low 16 bits of a 32 bit VGPR.
-  // It is encoded as a corresponding 32 bit register.
   def VGPR#Index#_LO16 : SIReg <"v"#Index#".l", Index>,
                          DwarfRegNum<[!add(Index, 2560)]> {
     let HWEncoding{8} = 1;
   }
-  // There is no special encoding for low 16 bit subreg, this not a real
-  // register but rather an operand for instructions preserving low 16 bits
-  // of the result or reading just high 16 bits of a 32 bit VGPR.
-  // It is encoded as a corresponding 32 bit register.
   def VGPR#Index#_HI16 : SIReg <"v"#Index#".h", Index>,
                          DwarfRegNum<[!add(Index, 2560)]> {
     let HWEncoding{8} = 1;
Index: llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir
@@ -0,0 +1,202 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: {{^}}lo_to_lo:
+# GCN: v_alignbyte_b32 v1, v0, v1, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+name: lo_to_lo
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1_lo16 = COPY $vgpr0_lo16
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_hi:
+# GCN: v_lshlrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
+name: lo_to_hi
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1_hi16 = COPY killed $vgpr0_lo16
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_lo:
+# GCN: v_lshrrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
+name: hi_to_lo
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1_lo16 = COPY $vgpr0_hi16
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_hi:
+# GCN: v_alignbyte_b32 v1, v1, v0, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+name: hi_to_hi
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1_hi16 = COPY $vgpr0_hi16
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_lo_samereg:
+# GCN: s_waitcnt
+# GCN-NEXT: s_endpgm
+name: lo_to_lo_samereg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr0_lo16 = COPY $vgpr0_lo16
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_hi_samereg:
+# GCN: v_pk_add_u16 v0, v0, 0 op_sel_hi:[0,0]
+name: lo_to_hi_samereg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr0_hi16 = COPY $vgpr0_lo16
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_lo_samereg:
+# GCN: v_pk_add_u16 v0, v0, 0 op_sel:[1,0] op_sel_hi:[1,0]
+name: hi_to_lo_samereg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr0_lo16 = COPY killed $vgpr0_hi16
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_hi_samereg:
+# GCN: s_waitcnt
+# GCN-NEXT: s_endpgm
+name: hi_to_hi_samereg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr0_hi16 = COPY killed $vgpr0_hi16
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_lo_def_livein:
+# GCN: v_alignbyte_b32 v1, v0, v1, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+name: lo_to_lo_def_livein
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    $vgpr1 = IMPLICIT_DEF
+    $vgpr1_lo16 = COPY $vgpr0_lo16
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_hi_def_livein:
+# GCN: v_lshlrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
+name: lo_to_hi_def_livein
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    $vgpr1 = IMPLICIT_DEF
+    $vgpr1_hi16 = COPY $vgpr0_lo16
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_lo_def_livein:
+# GCN: v_lshrrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
+name: hi_to_lo_def_livein
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    $vgpr1 = IMPLICIT_DEF
+    $vgpr1_lo16 = COPY killed $vgpr0_hi16
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}hi_to_hi_def_livein:
+# GCN: v_alignbyte_b32 v1, v1, v0, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+name: hi_to_hi_def_livein
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    $vgpr1 = IMPLICIT_DEF
+    $vgpr1_hi16 = COPY $vgpr0_hi16
+    S_ENDPGM 0
+...
+
+# TODO: This can be coalesced into a single VGPR_32 copy.
+# GCN-LABEL: {{^}}lo_to_lo_hi_to_hi:
+# GCN: v_alignbyte_b32 v1, v0, v1, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2
+# GCN-NEXT: v_mov_b32_e32 v2, v1
+# GCN-NEXT: s_endpgm
+name: lo_to_lo_hi_to_hi
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1_lo16 = COPY $vgpr0_lo16
+    $vgpr1_hi16 = COPY $vgpr0_hi16
+    $vgpr2 = COPY killed $vgpr1
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: {{^}}lo_to_hi_hi_to_lo:
+# GCN: v_lshrrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2
+# GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2
+# GCN-NEXT: v_mov_b32_e32 v2, v1
+# GCN-NEXT: s_endpgm
+name: lo_to_hi_hi_to_lo
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1_lo16 = COPY $vgpr0_hi16
+    $vgpr1_hi16 = COPY $vgpr0_lo16
+    $vgpr2 = COPY killed $vgpr1
+    S_ENDPGM 0
+...
+
+# NB: a copy from an undef source is simply deleted instead of expanded.
+# GCN-LABEL: {{^}}lo_to_lo_undef:
+# GCN: s_waitcnt
+# GCN-NEXT: v_mov_b32_e32 v2, v1
+# GCN-NEXT: s_endpgm
name: lo_to_lo_undef
+tracksRegLiveness: true
+body: |
+  bb.0:
+    $vgpr1_lo16 = COPY undef $vgpr0_lo16
+    $vgpr2 = COPY killed $vgpr1
+    S_ENDPGM 0
+...
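
A minimal standalone check of the copyPhysReg expansion table above, assuming
v_alignbyte_b32 d, s0, s1, 2 computes ((s0:s1) >> 16) with s0 in the upper 32
bits of the concatenation; each two-instruction sequence is replayed on plain
uint32_t values to confirm it writes the selected 16-bit half and preserves
the other half of the destination. The alignbyte2 helper and the hex
constants are names invented for this sketch, not anything from the patch.

#include <cassert>
#include <cstdint>

// Models v_alignbyte_b32 d, s0, s1, 2: byte offset 2 into the 64-bit
// concatenation {s0, s1}, i.e. (s1 >> 16) | (s0 << 16).
static uint32_t alignbyte2(uint32_t S0, uint32_t S1) {
  return static_cast<uint32_t>(((static_cast<uint64_t>(S0) << 32) | S1) >> 16);
}

int main() {
  const uint32_t S = 0xAAAABBBB; // source: hi16 = 0xAAAA, lo16 = 0xBBBB
  const uint32_t D = 0xCCCCDDDD; // dest:   hi16 = 0xCCCC, lo16 = 0xDDDD

  // l -> l : v_alignbyte_b32 d, s, d, 2 ; v_alignbyte_b32 d, d, d, 2
  uint32_t V = alignbyte2(S, D);
  V = alignbyte2(V, V);
  assert(V == 0xCCCCBBBB); // d.lo = s.lo, d.hi preserved

  // l -> h : v_lshlrev_b32 d, 16, d ; v_alignbyte_b32 d, s, d, 2
  V = D << 16;
  V = alignbyte2(S, V);
  assert(V == 0xBBBBDDDD); // d.hi = s.lo, d.lo preserved

  // h -> l : v_lshrrev_b32 d, 16, d ; v_alignbyte_b32 d, d, s, 2
  V = D >> 16;
  V = alignbyte2(V, S);
  assert(V == 0xCCCCAAAA); // d.lo = s.hi, d.hi preserved

  // h -> h : v_alignbyte_b32 d, d, s, 2 ; v_alignbyte_b32 d, d, d, 2
  V = alignbyte2(D, S);
  V = alignbyte2(V, V);
  assert(V == 0xAAAADDDD); // d.hi = s.hi, d.lo preserved

  return 0;
}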