Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include <unordered_map>
+#include <unordered_set>
 
 using namespace llvm;
 
@@ -44,26 +45,29 @@
 class SDWAOperand;
 
 class SIPeepholeSDWA : public MachineFunctionPass {
+public:
+  typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector;
+
 private:
   MachineRegisterInfo *MRI;
   const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;
 
   std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
 
   Optional<int64_t> foldToImm(const MachineOperand &Op) const;
 
 public:
   static char ID;
 
-  typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;
-
   SIPeepholeSDWA() : MachineFunctionPass(ID) {
     initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
   }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
   void matchSDWAOperands(MachineFunction &MF);
+  bool isConvertibleToSDWA(const MachineInstr &MI) const;
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
 
   StringRef getPassName() const override { return "SI Peephole SDWA"; }
@@ -468,7 +472,7 @@
 
   if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
     auto SDWADst =
-      make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+        make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
     DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
     SDWAOperands[&MI] = std::move(SDWADst);
     ++NumSDWAPatternsFound;
@@ -575,8 +579,7 @@
   }
 }
 
-bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
-                                   const SDWAOperandsVector &SDWAOperands) {
+bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
   // Check if this instruction can be converted to SDWA:
   // 1.
Does this opcode support SDWA
  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
@@ -588,6 +591,11 @@
     return false;
   }
 
+  return true;
+}
+
+bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+                                   const SDWAOperandsVector &SDWAOperands) {
   // Convert to sdwa
   int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
   assert(SDWAOpcode != -1);
@@ -664,7 +672,18 @@
   // Apply all sdwa operand pattenrs
   bool Converted = false;
   for (auto &Operand : SDWAOperands) {
-    Converted |= Operand->convertToSDWA(*SDWAInst, TII);
+    // There should be no intersection between SDWA operands and potential MIs
+    // e.g.:
+    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
+    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
+    // v_add_u32 v3, v4, v2
+    //
+    // In that example it is possible that we would fold 2nd instruction into 3rd
+    // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
+    // already destroyed). So if SDWAOperand is also a potential MI then do not
+    // apply it.
+    if (PotentialMatches.count(Operand->getParentInst()) == 0)
+      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
   }
   if (!Converted) {
     SDWAInst->eraseFromParent();
@@ -690,16 +709,15 @@
   MRI = &MF.getRegInfo();
   TRI = ST.getRegisterInfo();
   TII = ST.getInstrInfo();
-
-  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
-
+
+  // Find all SDWA operands in MF.
  matchSDWAOperands(MF);
 
-  for (auto &OperandPair : SDWAOperands) {
-    auto &Operand = OperandPair.second;
+  for (const auto &OperandPair : SDWAOperands) {
+    const auto &Operand = OperandPair.second;
     MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
-    if (PotentialMI) {
-      PotentialMatches[PotentialMI].push_back(std::move(Operand));
+    if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) {
+      PotentialMatches[PotentialMI].push_back(Operand.get());
     }
   }
 
@@ -708,6 +726,7 @@
     convertToSDWA(PotentialMI, PotentialPair.second);
   }
 
+  PotentialMatches.clear();
   SDWAOperands.clear();
   return false;
 }
Index: test/CodeGen/AMDGPU/sdwa-peephole.ll
===================================================================
--- test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -393,3 +393,53 @@
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
   ret void
 }
+
+
+; Check that "pulling out" SDWA operands works correctly.
+; GCN-LABEL: {{^}}pulled_out_test:
+; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_and_b32_sdwa
+; NOSDWA-NOT: v_or_b32_sdwa
+
+; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+
+define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) {
+entry:
+  %idxprom = ashr exact i64 15, 32
+  %arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom
+  %tmp = load <8 x i8>, <8 x i8> addrspace(1)* %arrayidx, align 8
+
+  %tmp1 = extractelement <8 x i8> %tmp, i32 0
+  %tmp2 = extractelement <8 x i8> %tmp, i32 1
+  %tmp3 = extractelement <8 x i8> %tmp, i32 2
+  %tmp4 = extractelement <8 x i8> %tmp, i32 3
+  %tmp5 = extractelement <8 x i8> %tmp, i32 4
+  %tmp6 = extractelement <8 x i8> %tmp, i32 5
+  %tmp7 = extractelement <8 x i8> %tmp, i32 6
+  %tmp8 = extractelement <8 x i8> %tmp, i32 7
+
+  %tmp9 = insertelement <2 x i8> undef, i8 %tmp1, i32 0
+  %tmp10 = insertelement <2 x i8> %tmp9, i8 %tmp2, i32 1
+  %tmp11 = insertelement <2 x i8> undef, i8 %tmp3, i32 0
+  %tmp12 = insertelement <2 x i8> %tmp11, i8 %tmp4, i32 1
+  %tmp13 = insertelement <2 x i8> undef, i8 %tmp5, i32 0
+  %tmp14 = insertelement <2 x i8> %tmp13, i8 %tmp6, i32 1
+  %tmp15 = insertelement <2 x i8> undef, i8 %tmp7, i32 0
+  %tmp16 = insertelement <2 x i8> %tmp15, i8 %tmp8, i32 1
+
+  %tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
+  store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
+  ret void
+}