Index: lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- lib/Target/AMDGPU/SIFoldOperands.cpp +++ lib/Target/AMDGPU/SIFoldOperands.cpp @@ -139,27 +139,6 @@ return new SIFoldOperands(); } -static bool isFoldableCopy(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - case AMDGPU::V_MOV_B64_PSEUDO: { - // If there are additional implicit register operands, this may be used for - // register indexing so the source register operand isn't simply copied. - unsigned NumOps = MI.getDesc().getNumOperands() + - MI.getDesc().getNumImplicitUses(); - - return MI.getNumOperands() == NumOps; - } - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::COPY: - return true; - default: - return false; - } -} - static bool updateOperand(FoldCandidate &Fold, const TargetRegisterInfo &TRI) { MachineInstr *MI = Fold.UseMI; @@ -936,7 +915,7 @@ tryFoldInst(TII, &MI); - if (!isFoldableCopy(MI)) { + if (!TII->isFoldableCopy(MI)) { if (IsIEEEMode || !tryFoldOMod(MI)) tryFoldClamp(MI); continue; Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -222,6 +222,8 @@ areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; + bool isFoldableCopy(const MachineInstr &MI) const; + bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, MachineRegisterInfo *MRI) const final; Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1488,6 +1488,27 @@ } } +bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: { + // If there are additional implicit register operands, this may be used for + // register indexing so the source register operand isn't simply copied. + unsigned NumOps = MI.getDesc().getNumOperands() + + MI.getDesc().getNumImplicitUses(); + + return MI.getNumOperands() == NumOps; + } + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::COPY: + return true; + default: + return false; + } +} + static void removeModOperands(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp =================================================================== --- lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -51,6 +51,8 @@ std::unordered_map> SDWAOperands; + bool isImm(const MachineOperand &Op, int64_t &Value); + public: static char ID; @@ -375,6 +377,35 @@ return true; } +bool SIPeepholeSDWA::isImm(const MachineOperand &Op, int64_t &Value) { + if (Op.isImm()) { + Value = Op.getImm(); + return true; + } + + // If this is not immediate then it can be copy of immediate value, e.g.: + // %vreg1 = S_MOV_B32 255; + if (Op.isReg()) { + for (const MachineOperand &Def: MRI->def_operands(Op.getReg())) { + if (!isSameReg(Op, Def)) + continue; + + const MachineInstr *DefInst = Def.getParent(); + if (!TII->isFoldableCopy(*DefInst) || !isSameBB(Op.getParent(), DefInst)) + return false; + + const MachineOperand &Copied = DefInst->getOperand(1); + if (!Copied.isImm()) + return false; + + Value = Copied.getImm(); + return true; + } + } + + return false; +} + void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { for (MachineInstr &MI : MBB) { unsigned Opcode = MI.getOpcode(); @@ -391,10 +422,10 @@ // from: v_lshlrev_b32_e32 v1, 16/24, v0 // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); - if (!Src0->isImm()) + int64_t Imm = 0; + if (!isImm(*Src0, Imm)) break; - int64_t Imm = Src0->getImm(); if (Imm != 16 && Imm != 24) break; @@ -433,7 +464,8 @@ // from: v_lshlrev_b16_e32 v1, 8, v0 // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); - if (!Src0->isImm() || Src0->getImm() != 8) + int64_t Imm = 0; + if (!isImm(*Src0, Imm) || Imm != 8) break; MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); @@ -477,14 +509,14 @@ // 24 | 8 | BYTE_3 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - if (!Src1->isImm()) + int64_t Offset = 0; + if (!isImm(*Src1, Offset)) break; - int64_t Offset = Src1->getImm(); MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); - if (!Src2->isImm()) + int64_t Width = 0; + if (!isImm(*Src2, Width)) break; - int64_t Width = Src2->getImm(); SdwaSel SrcSel = DWORD; @@ -526,10 +558,10 @@ // to SDWA src:v0 src_sel:WORD_0/BYTE_0 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); - if (!Src0->isImm()) + int64_t Imm = 0; + if (!isImm(*Src0, Imm)) break; - int64_t Imm = Src0->getImm(); if (Imm != 0x0000ffff && Imm != 0x000000ff) break; Index: test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- test/CodeGen/AMDGPU/sdwa-peephole.ll +++ test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -72,8 +72,9 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL]], v{{[0-9]+}} +; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: @@ -92,10 +93,12 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}} +; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) { entry: @@ -114,14 +117,18 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL3]], v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL2]], v{{[0-9]+}} +; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) { entry: @@ -155,7 +162,9 @@ ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA: v_mul_f16_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; SDWA: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { entry: @@ -176,7 +185,8 @@ ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) { entry: @@ -199,10 +209,10 @@ ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) { entry: @@ -347,26 +357,28 @@ ret void } -; GCN-LABEL: {{^}}mul_add_v2i16: +; GCN-LABEL: {{^}}mul_add_shr_i32: ; NOSDWA-NOT: v_mul_u32_u24_sdwa ; NOSDWA-NOT: v_add_i32_sdwa ; SDWA-NOT: v_mul_u32_u24_sdwa ; SDWA-NOT: v_add_i32_sdwa -define amdgpu_kernel void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) { +define void @mul_add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ina, i32 addrspace(1)* %inb, i1 addrspace(1)* %incond) { entry: - %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 - %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 + %a = load i32, i32 addrspace(1)* %ina, align 4 + %b = load i32, i32 addrspace(1)* %inb, align 4 %cond = load i1, i1 addrspace(1)* %incond, align 4 + %shra = lshr i32 %a, 16 + %shrb = lshr i32 %b, 16 br i1 %cond, label %mul_label, label %add_label mul_label: - %mul = mul <2 x i16> %a, %b + %mul = mul i32 %shra, %shrb br label %store_label add_label: - %add = add <2 x i16> %a, %b + %add = add i32 %shra, %shrb br label %store_label store_label: - %store = phi <2 x i16> [%mul, %mul_label], [%add, %add_label] - store <2 x i16> %store, <2 x i16> addrspace(1)* %out, align 4 + %store = phi i32 [%mul, %mul_label], [%add, %add_label] + store i32 %store, i32 addrspace(1)* %out, align 4 ret void } \ No newline at end of file