Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -683,16 +683,6 @@ assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || AMDGPU::VGPR_HI16RegClass.contains(SrcReg)); - // d s - // l -> l : hhhhxxxx : xxxxllll -> v_alignbyte_b32 d, s, d, 2 - // llllhhhh : xxxxllll -> v_alignbyte_b32 d, d, d, 2 - // l -> h : xxxxllll : xxxxhhhh -> v_lshlrev_b32 d, 16, d - // llll0000 : xxxxhhhh -> v_alignbyte_b32 d, s, d, 2 - // h -> l : hhhhxxxx : llllxxxx -> v_lshrrev_b32 d, 16, d - // 0000hhhh : llllxxxx -> v_alignbyte_b32 d, d, s, 2 - // h -> h : xxxxllll : hhhhxxxx -> v_alignbyte_b32 d, d, s, 2 - // llllhhhh : hhhhxxxx -> v_alignbyte_b32 d, d, d, 2 - bool DstLow = RC == &AMDGPU::VGPR_LO16RegClass; bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg); DestReg = RI.getMatchingSuperReg(DestReg, @@ -702,48 +692,15 @@ SrcLow ? AMDGPU::lo16 : AMDGPU::hi16, &AMDGPU::VGPR_32RegClass); - if (DestReg == SrcReg) { - // l -> h : v_pk_add_u16 v1, v1, 0 op_sel_hi:[0,0] - // h -> l : v_pk_add_u16 v1, v1, 0 op_sel:[1,0] op_sel_hi:[1,0] - if (DstLow == SrcLow) - return; - BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_ADD_U16), DestReg) - .addImm(DstLow ? SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1 : 0) - .addReg(DestReg, RegState::Undef) - .addImm(0) // src1_mod - .addImm(0) // src1 - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0); - - return; - } - - // Last instruction first: - auto Last = BuildMI(MBB, MI, DL, get(AMDGPU::V_ALIGNBYTE_B32), DestReg) - .addReg((SrcLow && !DstLow) ? SrcReg : DestReg, - (SrcLow && !DstLow) ? getKillRegState(KillSrc) : 0) - .addReg((!SrcLow && DstLow) ? SrcReg : DestReg, - (!SrcLow && DstLow) ? getKillRegState(KillSrc) : 0) - .addImm(2); - - unsigned OpcFirst = (DstLow == SrcLow) ? AMDGPU::V_ALIGNBYTE_B32 - : SrcLow ? AMDGPU::V_LSHRREV_B32_e32 - : AMDGPU::V_LSHLREV_B32_e32; - auto First = BuildMI(MBB, &*Last, DL, get(OpcFirst), DestReg); - if (DstLow == SrcLow) { // alignbyte - First - .addReg(SrcLow ? SrcReg : DestReg, - SrcLow ? getKillRegState(KillSrc) : unsigned(RegState::Undef)) - .addReg(SrcLow ? DestReg : SrcReg, - SrcLow ? unsigned(RegState::Undef) : getKillRegState(KillSrc)) - .addImm(2); - } else { - First.addImm(16) - .addReg(DestReg, RegState::Undef); - } + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), DestReg) + .addImm(0) // src0_modifiers + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) // clamp + .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 + : AMDGPU::SDWA::SdwaSel::WORD_1) + .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) + .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 + : AMDGPU::SDWA::SdwaSel::WORD_1); return; } @@ -3519,25 +3476,33 @@ } } - const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); + const MachineOperand *DstUnused = + getNamedOperand(MI, AMDGPU::OpName::dst_unused); if (DstUnused && DstUnused->isImm() && DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { const MachineOperand &Dst = MI.getOperand(DstIdx); - if (!Dst.isReg() || !Dst.isTied()) { + if (!Dst.isReg()) { + ErrInfo = "Dst preserve used but dst is not a register"; + return false; + } + if (Dst.getReg().isVirtual() && !Dst.isTied()) { ErrInfo = "Dst register should have tied register"; return false; } - const MachineOperand &TiedMO = - MI.getOperand(MI.findTiedOperandIdx(DstIdx)); - if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { - ErrInfo = + if (Dst.isTied()) { + const MachineOperand &TiedMO = + MI.getOperand(MI.findTiedOperandIdx(DstIdx)); + if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { + ErrInfo = "Dst register should be tied to implicit use of preserved register"; - return false; - } else if (Register::isPhysicalRegister(TiedMO.getReg()) && - Dst.getReg() != TiedMO.getReg()) { - ErrInfo = "Dst register should use same physical register as preserved"; - return false; + return false; + } else if (Register::isPhysicalRegister(TiedMO.getReg()) && + Dst.getReg() != TiedMO.getReg()) { + ErrInfo = + "Dst register should use same physical register as preserved"; + return false; + } } } } Index: llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir +++ llvm/test/CodeGen/AMDGPU/lo16-hi16-physreg-copy.mir @@ -1,8 +1,9 @@ +# RUN: llc -march=amdgcn -mcpu=gfx802 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s # RUN: llc -march=amdgcn -mcpu=gfx900 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -start-before postrapseudos -asm-verbose=0 -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s # GCN-LABEL: {{^}}lo_to_lo: -# GCN: v_alignbyte_b32 v1, v0, v1, 2 -# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2 +# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 name: lo_to_lo tracksRegLiveness: true body: | @@ -13,8 +14,7 @@ ... # GCN-LABEL: {{^}}lo_to_hi: -# GCN: v_lshrrev_b32_e32 v1, 16, v1 -# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2 +# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 name: lo_to_hi tracksRegLiveness: true body: | @@ -25,8 +25,7 @@ ... # GCN-LABEL: {{^}}hi_to_lo: -# GCN: v_lshlrev_b32_e32 v1, 16, v1 -# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2 +# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 name: hi_to_lo tracksRegLiveness: true body: | @@ -37,8 +36,7 @@ ... # GCN-LABEL: {{^}}hi_to_hi: -# GCN: v_alignbyte_b32 v1, v1, v0, 2 -# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2 +# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 name: hi_to_hi tracksRegLiveness: true body: | @@ -49,8 +47,9 @@ ... # GCN-LABEL: {{^}}lo_to_lo_samereg: -# GCN: s_waitcnt -# GCN-NEXT: s_endpgm +# GCN: s_waitcnt +# GFX10-NEXT: s_waitcnt_vscnt +# GCN-NEXT: s_endpgm name: lo_to_lo_samereg tracksRegLiveness: true body: | @@ -61,7 +60,7 @@ ... # GCN-LABEL: {{^}}lo_to_hi_samereg: -# GCN: v_pk_add_u16 v0, v0, 0 op_sel_hi:[0,0] +# GCN: v_mov_b32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 name: lo_to_hi_samereg tracksRegLiveness: true body: | @@ -72,7 +71,7 @@ ... # GCN-LABEL: {{^}}hi_to_lo_samereg: -# GCN: v_pk_add_u16 v0, v0, 0 op_sel:[1,0] op_sel_hi:[1,0] +# GCN: v_mov_b32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 name: hi_to_lo_samereg tracksRegLiveness: true body: | @@ -84,6 +83,7 @@ # GCN-LABEL: {{^}}hi_to_hi_samereg: # GCN: s_waitcnt +# GFX10-NEXT: s_waitcnt_vscnt # GCN-NEXT: s_endpgm name: hi_to_hi_samereg tracksRegLiveness: true @@ -95,8 +95,7 @@ ... # GCN-LABEL: {{^}}lo_to_lo_def_livein: -# GCN: v_alignbyte_b32 v1, v0, v1, 2 -# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2 +# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 name: lo_to_lo_def_livein tracksRegLiveness: true body: | @@ -109,8 +108,7 @@ ... # GCN-LABEL: {{^}}lo_to_hi_def_livein: -# GCN: v_lshrrev_b32_e32 v1, 16, v1 -# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2 +# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 name: lo_to_hi_def_livein tracksRegLiveness: true body: | @@ -123,8 +121,7 @@ ... # GCN-LABEL: {{^}}hi_to_lo_def_livein: -# GCN: v_lshlrev_b32_e32 v1, 16, v1 -# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2 +# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 name: hi_to_lo_def_livein tracksRegLiveness: true body: | @@ -137,8 +134,7 @@ ... # GCN-LABEL: {{^}}hi_to_hi_def_livein: -# GCN: v_alignbyte_b32 v1, v1, v0, 2 -# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2 +# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 name: hi_to_hi_def_livein tracksRegLiveness: true body: | @@ -152,10 +148,8 @@ # TODO: This can be coalesced into a VGPR_32 copy # GCN-LABEL: {{^}}lo_to_lo_hi_to_hi: -# GCN: v_alignbyte_b32 v1, v0, v1, 2 -# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2 -# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2 -# GCN-NEXT: v_alignbyte_b32 v1, v1, v1, 2 +# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +# GCN-NEXT: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 # GCN-NEXT: v_mov_b32_e32 v2, v1 # GCN-NEXT: s_endpgm name: lo_to_lo_hi_to_hi @@ -170,10 +164,8 @@ ... # GCN-LABEL: {{^}}lo_to_hi_hi_to_lo: -# GCN: v_lshlrev_b32_e32 v1, 16, v1 -# GCN-NEXT: v_alignbyte_b32 v1, v1, v0, 2 -# GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -# GCN-NEXT: v_alignbyte_b32 v1, v0, v1, 2 +# GCN: v_mov_b32_sdwa v1, v0 dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 +# GCN-NEXT: v_mov_b32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 # GCN-NEXT: v_mov_b32_e32 v2, v1 # GCN-NEXT: s_endpgm name: lo_to_hi_hi_to_lo @@ -189,9 +181,10 @@ # NB: copy of undef just killed instead of expansion # GCN-LABEL: {{^}}lo_to_lo_undef: -# GCN: s_waitcnt -# GCN-NEXT: v_mov_b32_e32 v2, v1 -# GCN-NEXT: s_endpgm +# GCN: s_waitcnt +# GFX10-NEXT: s_waitcnt_vscnt +# GCN-NEXT: v_mov_b32_e32 v2, v1 +# GCN-NEXT: s_endpgm name: lo_to_lo_undef tracksRegLiveness: true body: |