Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -37,6 +37,7 @@
 FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
+FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
@@ -58,6 +59,9 @@
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
 
+void initializeSIPeepholeSDWAPass(PassRegistry &);
+extern char &SIPeepholeSDWAID;
+
 void initializeSIShrinkInstructionsPass(PassRegistry&);
 extern char &SIShrinkInstructionsID;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -93,6 +93,11 @@
   cl::init(false), cl::Hidden);
 
+static cl::opt<bool> EnableSDWAPeephole(
+  "amdgpu-sdwa-peephole",
+  cl::desc("Enable SDWA peepholer"),
+  cl::init(false));
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -103,6 +108,7 @@
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
   initializeSIFoldOperandsPass(*PR);
+  initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIFixControlFlowLiveIntervalsPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
@@ -663,6 +669,10 @@
 void GCNPassConfig::addPreRegAlloc() {
   addPass(createSIShrinkInstructionsPass());
+  if (EnableSDWAPeephole) {
+    addPass(&SIPeepholeSDWAID);
+    addPass(&DeadMachineInstructionElimID);
+  }
   addPass(createSIWholeQuadModePass());
 }
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -88,6 +88,7 @@
   SIMachineFunctionInfo.cpp
   SIMachineScheduler.cpp
   SIOptimizeExecMasking.cpp
+  SIPeepholeSDWA.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -763,6 +763,9 @@
   int getVOPe32(uint16_t Opcode);
 
   LLVM_READONLY
+  int getSDWAOp(uint16_t Opcode);
+
+  LLVM_READONLY
   int getCommuteRev(uint16_t Opcode);
 
   LLVM_READONLY
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -1237,6 +1237,15 @@
   let ValueCols = [["4", "0"]];
 }
 
+// Maps ordinary instructions to their SDWA counterparts
+def getSDWAOp : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["AsmVariantName"];
+  let KeyCol = ["Default"];
+  let ValueCols = [["SDWA"]];
+}
+
 def getMaskedMIMGOp : InstrMapping {
   let FilterClass = "MIMG_Mask";
   let RowFields = ["Op"];
Index: lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -0,0 +1,498 @@
+//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass tries to apply several peephole SDWA patterns.
+///
+/// E.g. original:
+///   V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
+///   V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
+///   V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
+///
+/// Replace:
+///   V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
+///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+///
+//===----------------------------------------------------------------------===//
+
+#include <unordered_map>
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-peephole-sdwa"
+
+STATISTIC(NumSDWAPatternsFound,
+          "Number of SDWA patterns found.");
+STATISTIC(NumSDWAInstructionsPeepholed,
+          "Number of instructions converted to SDWA.");
+
+namespace {
+
+class SDWAOperand;
+
+class SIPeepholeSDWA : public MachineFunctionPass {
+private:
+  MachineRegisterInfo *MRI;
+  const SIRegisterInfo *TRI;
+  const SIInstrInfo *TII;
+
+  std::unordered_map<MachineInstr *, std::shared_ptr<SDWAOperand>> SDWAOperands;
+
+public:
+  static char ID;
+
+  typedef SmallVector<std::shared_ptr<SDWAOperand>, 4> SDWAOperandsVector;
+
+  SIPeepholeSDWA() : MachineFunctionPass(ID) {
+    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void matchSDWAOperands(MachineBasicBlock &MBB);
+  bool isConvertibleToSDWA(MachineInstr &MI);
+  void convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
+
+  StringRef getPassName() const override { return "SI Peephole SDWA"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+class SDWAOperand {
+private:
+  MachineOperand *Target; // Operand that would be used in converted instruction
+  MachineOperand *Source; // Operand that would be replaced by Target
+
+public:
+  SDWAOperand(MachineOperand *TargetOp, MachineOperand *SourceOp)
+      : Target(TargetOp), Source(SourceOp) {
+    assert(Target->isReg());
+    assert(Source->isReg());
+  }
+
+  virtual ~SDWAOperand() {}
+
+  virtual SmallVector<MachineInstr *, 4> potentialsToConvert() = 0;
+  virtual void convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
+
+  MachineOperand *getTargetOperand() const { return Target; }
+  MachineOperand *getSourceOperand() const { return Source; }
+  MachineInstr *getParentInst() const { return Target->getParent(); }
+  MachineRegisterInfo *getMRI() const;
+};
+
+using namespace AMDGPU::SDWA;
+
+class SDWASrcOperand : public SDWAOperand {
+private:
+  SdwaSel SrcSel;
+
+public:
+  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *SourceOp,
+                 SdwaSel SrcSel_ = DWORD)
+      : SDWAOperand(TargetOp, SourceOp), SrcSel(SrcSel_) {}
+
+  virtual SmallVector<MachineInstr *, 4> potentialsToConvert() override;
+  virtual void convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+  SdwaSel getSrcSel() const { return SrcSel; }
+};
+
+class SDWADstOperand : public SDWAOperand {
+private:
+  SdwaSel DstSel;
+  DstUnused DstUn;
+
+public:
+  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *SourceOp,
+                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
+      : SDWAOperand(TargetOp, SourceOp), DstSel(DstSel_), DstUn(DstUn_) {}
+
+  virtual SmallVector<MachineInstr *, 4> potentialsToConvert() override;
+  virtual void convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+  SdwaSel getDstSel() const { return DstSel; }
+  DstUnused getDstUnused() const { return DstUn; }
+};
+
+inline raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
+  switch (Sel) {
+  case BYTE_0: OS << "BYTE_0"; break;
+  case BYTE_1: OS << "BYTE_1"; break;
+  case BYTE_2: OS << "BYTE_2"; break;
+  case BYTE_3: OS << "BYTE_3"; break;
+  case WORD_0: OS << "WORD_0"; break;
+  case WORD_1: OS << "WORD_1"; break;
+  case DWORD: OS << "DWORD"; break;
+  default: llvm_unreachable("Wrong SDWA selector");
+  }
+  return OS;
+}
+
+inline raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
+  switch (Un) {
+  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
+  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
+  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
+  default: llvm_unreachable("Wrong SDWA dst_unused");
+  }
+  return OS;
+}
+
+inline raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
+  OS << "SDWA src:" << *Src.getTargetOperand()
+     << " src_sel:" << Src.getSrcSel() << "\n";
+  return OS;
+}
+
+inline raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
+  OS << "SDWA dst:" << *Dst.getTargetOperand()
+     << " dst_sel:" << Dst.getDstSel()
+     << " dst_unused:" << Dst.getDstUnused() << "\n";
+  return OS;
+}
+
+// Returns true if both instructions are in the same basic block of the same
+// function.
+bool isSameBB(const MachineInstr *FirstMI, const MachineInstr *SecondMI) {
+  assert(FirstMI && SecondMI);
+  const MachineBasicBlock *FirstBB = FirstMI->getParent();
+  const MachineBasicBlock *SecondBB = SecondMI->getParent();
+  assert(FirstBB && SecondBB);
+  const MachineFunction *FirstFN = FirstBB->getParent();
+  const MachineFunction *SecondFN = SecondBB->getParent();
+  if (FirstFN && SecondFN &&
+      FirstFN->getFunctionNumber() == SecondFN->getFunctionNumber()) {
+    int FirstBBNum = FirstBB->getNumber();
+    int SecondBBNum = SecondBB->getNumber();
+    if (FirstBBNum >= 0 && SecondBBNum >= 0 && FirstBBNum == SecondBBNum)
+      return true;
+  }
+  return false;
+}
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
+
+char SIPeepholeSDWA::ID = 0;
+
+char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
+
+FunctionPass *llvm::createSIPeepholeSDWAPass() {
+  return new SIPeepholeSDWA();
+}
+
+MachineRegisterInfo *SDWAOperand::getMRI() const {
+  if (MachineInstr *MI = getParentInst()) {
+    if (MachineBasicBlock *MBB = MI->getParent())
+      return &MBB->getParent()->getRegInfo();
+  }
+  return nullptr;
+}
+
+SmallVector<MachineInstr *, 4> SDWASrcOperand::potentialsToConvert() {
+  // For an SDWA src operand, the potential instructions are those that use a
+  // register defined by the parent instruction.
+  SmallVector<MachineInstr *, 4> Potentials;
+  MachineInstr *ParentMI = getParentInst();
+  assert(ParentMI);
+  if (MachineRegisterInfo *MRI = getMRI()) {
+    for (MachineOperand &Def : ParentMI->defs()) {
+      for (MachineInstr &PotentialMI : MRI->use_instructions(Def.getReg())) {
+        // Check that both instructions are in the same basic block
+        if (isSameBB(ParentMI, &PotentialMI)) {
+          Potentials.push_back(&PotentialMI);
+        }
+      }
+    }
+  }
+  return Potentials;
+}
+
+void SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+  // Find the operand in MI that matches the source operand, replace it with
+  // the target operand, and set the corresponding src_sel.
+  for (MachineOperand &Operand : MI.explicit_uses()) {
+    if (!Operand.isReg() || Operand.getReg() != getSourceOperand()->getReg()) {
+      continue;
+    }
+
+    // Find src_sel for this operand
+    MachineOperand *SrcSel = nullptr;
+    unsigned OperandIdx = MI.getOperandNo(&Operand);
+    if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst) == -1) {
+      // For VOPC, OperandIdx should be either 1 for src0 or 3 for src1
+      switch (OperandIdx) {
+      case 1: SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); break;
+      case 3: SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); break;
+      default: continue;
+      }
+    } else {
+      // For non-VOPC, OperandIdx should be either 2 for src0 or 4 for src1
+      switch (OperandIdx) {
+      case 2: SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); break;
+      case 4: SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); break;
+      default: continue;
+      }
+    }
+    if (!SrcSel)
+      continue;
+
+    Operand.setReg(getTargetOperand()->getReg());
+    SrcSel->setImm(getSrcSel());
+  }
+}
+
+SmallVector<MachineInstr *, 4> SDWADstOperand::potentialsToConvert() {
+  // For an SDWA dst operand, the potential instruction is the one that
+  // defines the register that this operand uses.
+  MachineInstr *ParentMI = getParentInst();
+  assert(ParentMI);
+  MachineOperand *Source = getSourceOperand();
+  assert(Source->isReg());
+
+  if (MachineRegisterInfo *MRI = getMRI()) {
+    MachineInstr *PotentialMI = MRI->getVRegDef(Source->getReg());
+    assert(PotentialMI);
+    // Check that both instructions are in the same basic block
+    if (isSameBB(ParentMI, PotentialMI)) {
+      // Check that ParentMI is the only instruction that uses the src register
+      int Dist = std::distance(MRI->use_instr_begin(Source->getReg()),
+                               MRI->use_instr_end());
+      if (Dist == 1) {
+        return SmallVector<MachineInstr *, 4>(1, PotentialMI);
+      }
+    }
+  }
+  return SmallVector<MachineInstr *, 4>();
+}
+
+void SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+  // Replace the vdst operand in MI with the target operand, and set dst_sel
+  // and dst_unused.
+  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  assert(Operand &&
+         Operand->isReg() &&
+         Operand->getReg() == getSourceOperand()->getReg());
+  Operand->setReg(getTargetOperand()->getReg());
+  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
+  assert(DstSel);
+  DstSel->setImm(getDstSel());
+  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+  assert(DstUnused);
+  DstUnused->setImm(getDstUnused());
+
+  // Remove the original instruction because it would conflict with our new
+  // instruction by register definition
+  getParentInst()->eraseFromParent();
+}
+
+void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
+  // FIXME: It is possible to simplify these patterns because this pass is
+  // running after SIShrinkInstructions => we can get rid of _e64 instructions
+  // and we don't have to check for VGPRs
+  for (MachineInstr &MI : MBB) {
+    unsigned Opcode = MI.getOpcode();
+    switch (Opcode) {
+    case AMDGPU::V_LSHRREV_B32_e32:
+    case AMDGPU::V_LSHRREV_B32_e64: {
+      // from: v_lshrrev_b32_e32 v1, 16, v0
+      // to SDWA src: v0 src_sel:WORD_1
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      if (!Src0->isImm() || Src0->getImm() != 16)
+        break;
+
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      if (!Src1->isReg() || !TRI->isVGPR(*MRI, Src1->getReg()))
+        break;
+
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+      if (!Dst->isReg() || !TRI->isVGPR(*MRI, Dst->getReg()))
+        break;
+
+      auto SDWASrc = std::make_shared<SDWASrcOperand>(Src1, Dst, WORD_1);
+      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc);
+      SDWAOperands[&MI] = std::move(SDWASrc);
+      ++NumSDWAPatternsFound;
+      break;
+    }
+
+    case AMDGPU::V_LSHLREV_B32_e32:
+    case AMDGPU::V_LSHLREV_B32_e64: {
+      // from: v_lshlrev_b32_e32 v1, 16, v0
+      // to SDWA dst: v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+      if (!Dst->isReg() || !TRI->isVGPR(*MRI, Dst->getReg()))
+        break;
+
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      if (!Src0->isImm() || Src0->getImm() != 16)
+        break;
+
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      if (!Src1->isReg() || !TRI->isVGPR(*MRI, Src1->getReg()))
+        break;
+
+      auto SDWADst =
+          std::make_shared<SDWADstOperand>(Dst, Src1, WORD_1, UNUSED_PAD);
+      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst);
+      SDWAOperands[&MI] = std::move(SDWADst);
+      ++NumSDWAPatternsFound;
+      break;
+    }
+
+    default: continue;
+    }
+  }
+}
+
+bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI) {
+  // Check if this instruction can be converted to SDWA:
+  // 1. Does this opcode support SDWA?
+  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
+    return false;
+
+  // 2. Are all operands VGPRs?
+  for (MachineOperand &Operand : MI.explicit_operands()) {
+    if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
+      return false;
+  }
+
+  return true;
+}
+
+void SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+                                   const SDWAOperandsVector &SDWAOperands) {
+  int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
+  assert(SDWAOpcode != -1);
+
+  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
+
+  // Create the SDWA version of instruction MI and initialize its operands
+  MachineInstrBuilder SDWAInst =
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+
+  // Copy dst; if it is present in the original it should also be present in
+  // the SDWA version
+  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  if (Dst) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
+    SDWAInst.add(*Dst);
+  } else {
+    assert(TII->isVOPC(MI));
+  }
+
+  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
+  // and src0_modifiers (except for v_nop_sdwa, but it can't get here)
+  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  assert(Src0 &&
+         AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
+         AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
+  SDWAInst.addImm(0);
+  SDWAInst.add(*Src0);
+
+  // Copy src1 if present and initialize src1_modifiers.
+  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Src1) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
+           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
+    SDWAInst.addImm(0);
+    SDWAInst.add(*Src1);
+  } else {
+    assert(TII->isVOP1(MI));
+  }
+
+  // Initialize clamp.
+  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
+  SDWAInst.addImm(0);
+
+  // Initialize dst_sel and dst_unused if present
+  if (Dst) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
+           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
+    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+  }
+
+  // Initialize src0_sel
+  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
+  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+
+  // Initialize src1_sel if present
+  if (Src1) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
+    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+  }
+
+  // Apply all SDWA operand patterns
+  for (const std::shared_ptr<SDWAOperand> &Operand : SDWAOperands) {
+    Operand->convertToSDWA(*SDWAInst, TII);
+  }
+
+  DEBUG(dbgs() << "Convert instruction:" << MI
+               << "Into:" << *SDWAInst << '\n');
+  ++NumSDWAInstructionsPeepholed;
+
+  MI.eraseFromParent();
+}
+
+bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  // FIXME: We don't support SDWA anywhere other than VI
+  if (!ST.hasSDWA() || !AMDGPU::isVI(ST))
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TRI = ST.getRegisterInfo();
+  TII = ST.getInstrInfo();
+
+  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+
+  // FIXME: For now we only combine instructions in one basic block
+  for (MachineBasicBlock &MBB : MF) {
+    SDWAOperands.clear();
+    matchSDWAOperands(MBB);
+
+    PotentialMatches.clear();
+    for (auto &OperandPair : SDWAOperands) {
+      auto Operand = OperandPair.second;
+      for (MachineInstr *PotentialMI : Operand->potentialsToConvert()) {
+        PotentialMatches[PotentialMI].push_back(Operand);
+      }
+    }
+
+    for (auto &PotentialPair : PotentialMatches) {
+      MachineInstr &PotentialMI = *PotentialPair.first;
+      if (isConvertibleToSDWA(PotentialMI)) {
+        convertToSDWA(PotentialMI, PotentialPair.second);
+      }
+    }
+  }
+  return false;
+}
Index: lib/Target/AMDGPU/VOPInstructions.td
===================================================================
--- lib/Target/AMDGPU/VOPInstructions.td
+++ lib/Target/AMDGPU/VOPInstructions.td
@@ -267,7 +267,7 @@
   let SDWA = 1;
   let Uses = [EXEC];
 
-  let SubtargetPredicate = HasSDWA;
+  let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst);
   let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst);
   let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
                                      AMDGPUAsmVariants.Disable);
Index: test/CodeGen/AMDGPU/sdwa-peephole.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -0,0 +1,66 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=fiji --amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}shr0:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+
+; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define void @shr0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %a = load i32, i32 addrspace(1)* %in, align 4
+  %shr = lshr i32 %a, 16
+  %add = add i32 %a, %shr
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}shr1:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+
+; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define void @shr1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %a = load i32, i32 addrspace(1)* %in, align 4
+  %shr = lshr i32 %a, 16
+  %add = add i32 %shr, %a
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}shr01:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_add_i32_e32 v{{[0-9]+}}, vcc, v[[DST1]], v[[DST0]]
+
+; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+define void @shr01(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) {
+  %a = load i32, i32 addrspace(1)* %in1, align 4
+  %b = load i32, i32 addrspace(1)* %in2, align 4
+  %shra = lshr i32 %a, 16
+  %shrb = lshr i32 %b, 16
+  %add = add i32 %shra, %shrb
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}vector2x16:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL]], v{{[0-9]+}}
+
+define void @vector2x16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <2 x i16> %a, %b
+  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
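
For reference, a minimal way to exercise the new pass (a sketch, assuming llc is built from a tree with this patch applied; the pass is off by default, so the flag must be passed explicitly):

  llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -verify-machineinstrs < test/CodeGen/AMDGPU/sdwa-peephole.ll

In an assertions-enabled build, adding -debug-only=si-peephole-sdwa prints the matched patterns and converted instructions from the DEBUG() statements in SIPeepholeSDWA.cpp.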