Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
@@ -37,6 +37,7 @@
 FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
+FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
@@ -58,6 +59,9 @@
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
 
+void initializeSIPeepholeSDWAPass(PassRegistry &);
+extern char &SIPeepholeSDWAID;
+
 void initializeSIShrinkInstructionsPass(PassRegistry&);
 extern char &SIShrinkInstructionsID;
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -94,6 +94,11 @@
   cl::init(false), cl::Hidden);
 
+static cl::opt<bool> EnableSDWAPeephole(
+  "amdgpu-sdwa-peephole",
+  cl::desc("Enable SDWA peepholer"),
+  cl::init(false));
+
 // Enable address space based alias analysis
 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
   cl::desc("Enable AMDGPU Alias Analysis"),
@@ -109,6 +114,7 @@
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
   initializeSIFoldOperandsPass(*PR);
+  initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIFixControlFlowLiveIntervalsPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
@@ -683,6 +689,10 @@
 void GCNPassConfig::addPreRegAlloc() {
   addPass(createSIShrinkInstructionsPass());
+  if (EnableSDWAPeephole) {
+    addPass(&SIPeepholeSDWAID);
+    addPass(&DeadMachineInstructionElimID);
+  }
   addPass(createSIWholeQuadModePass());
 }
Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
@@ -89,6 +89,7 @@
   SIMachineFunctionInfo.cpp
   SIMachineScheduler.cpp
   SIOptimizeExecMasking.cpp
+  SIPeepholeSDWA.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
@@ -769,6 +769,9 @@
   int getVOPe32(uint16_t Opcode);
 
   LLVM_READONLY
+  int getSDWAOp(uint16_t Opcode);
+
+  LLVM_READONLY
   int getCommuteRev(uint16_t Opcode);
 
   LLVM_READONLY
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1441,6 +1441,15 @@
   let ValueCols = [["4", "0"]];
 }
 
+// Maps ordinary instructions to their SDWA counterparts
+def getSDWAOp : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["AsmVariantName"];
+  let KeyCol = ["Default"];
+  let ValueCols = [["SDWA"]];
+}
+
 def getMaskedMIMGOp : InstrMapping {
   let FilterClass = "MIMG_Mask";
   let RowFields = ["Op"];
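Note on the getSDWAOp mapping above: TableGen turns an InstrMapping into a
generated lookup table plus the AMDGPU::getSDWAOp() helper declared in
SIInstrInfo.h, keyed here by OpName with the SDWA asm variant as the value
column. A minimal sketch of how the new pass consumes it, mirroring the guard
in SIPeepholeSDWA::convertToSDWA further down:

    // Query the generated table; -1 means the opcode has no _sdwa form.
    int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
    if (SDWAOpcode == -1)
      return false;
    const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); // e.g. V_ADD_I32_sdwa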
Index: llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -0,0 +1,692 @@
+//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass tries to apply several peephole SDWA patterns.
+///
+/// E.g. original:
+///   V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
+///   V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
+///   V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
+///
+/// Replace:
+///   V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
+///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include <unordered_map>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-peephole-sdwa"
+
+STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
+STATISTIC(NumSDWAInstructionsPeepholed,
+          "Number of instructions converted to SDWA.");
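A quick reminder of the SDWA (Sub-Dword Addressing) fields used throughout
this file: src0_sel/src1_sel pick which byte or word of a 32-bit source the
ALU reads, dst_sel picks which byte or word of the destination is written, and
dst_unused controls the remaining destination bits (UNUSED_PAD fills them with
zeros). A rough paraphrase of the example in the header comment (semantics
summarized for illustration, not quoted from the ISA manual):

    ; V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
    ;     dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
    ; reads  src0 = %vreg1[31:16], src1 = %vreg3[31:0]
    ; writes %vreg4[31:16] = result[15:0], %vreg4[15:0] = 0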
+namespace {
+
+class SDWAOperand;
+
+class SIPeepholeSDWA : public MachineFunctionPass {
+private:
+  MachineRegisterInfo *MRI;
+  const SIRegisterInfo *TRI;
+  const SIInstrInfo *TII;
+
+  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+
+public:
+  static char ID;
+
+  typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;
+
+  SIPeepholeSDWA() : MachineFunctionPass(ID) {
+    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void matchSDWAOperands(MachineBasicBlock &MBB);
+  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
+
+  StringRef getPassName() const override { return "SI Peephole SDWA"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+class SDWAOperand {
+private:
+  MachineOperand *Target;   // Operand that will be used in the converted
+                            // instruction
+  MachineOperand *Replaced; // Operand that will be replaced by Target
+
+public:
+  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
+      : Target(TargetOp), Replaced(ReplacedOp) {
+    assert(Target->isReg());
+    assert(Replaced->isReg());
+  }
+
+  virtual ~SDWAOperand() {}
+
+  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
+  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
+
+  MachineOperand *getTargetOperand() const { return Target; }
+  MachineOperand *getReplacedOperand() const { return Replaced; }
+  MachineInstr *getParentInst() const { return Target->getParent(); }
+  MachineRegisterInfo *getMRI() const {
+    return &getParentInst()->getParent()->getParent()->getRegInfo();
+  }
+};
+
+using namespace AMDGPU::SDWA;
+
+class SDWASrcOperand : public SDWAOperand {
+private:
+  SdwaSel SrcSel;
+  bool Abs;
+  bool Neg;
+  bool Sext;
+
+public:
+  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
+                 bool Sext_ = false)
+      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
+        Neg(Neg_), Sext(Sext_) {}
+
+  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+  SdwaSel getSrcSel() const { return SrcSel; }
+  bool getAbs() const { return Abs; }
+  bool getNeg() const { return Neg; }
+  bool getSext() const { return Sext; }
+
+  uint64_t getSrcMods() const;
+};
+
+class SDWADstOperand : public SDWAOperand {
+private:
+  SdwaSel DstSel;
+  DstUnused DstUn;
+
+public:
+  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
+      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
+
+  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+  SdwaSel getDstSel() const { return DstSel; }
+  DstUnused getDstUnused() const { return DstUn; }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
+
+char SIPeepholeSDWA::ID = 0;
+
+char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
+
+FunctionPass *llvm::createSIPeepholeSDWAPass() {
+  return new SIPeepholeSDWA();
+}
+
+#ifndef NDEBUG
+
+static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
+  switch(Sel) {
+  case BYTE_0: OS << "BYTE_0"; break;
+  case BYTE_1: OS << "BYTE_1"; break;
+  case BYTE_2: OS << "BYTE_2"; break;
+  case BYTE_3: OS << "BYTE_3"; break;
+  case WORD_0: OS << "WORD_0"; break;
+  case WORD_1: OS << "WORD_1"; break;
+  case DWORD:  OS << "DWORD";  break;
+  }
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
+  switch(Un) {
+  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
+  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
+  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
+  }
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
+  OS << "SDWA src: " << *Src.getTargetOperand()
+     << " src_sel:" << Src.getSrcSel()
+     << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
+     << " sext:" << Src.getSext() << '\n';
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
+  OS << "SDWA dst: " << *Dst.getTargetOperand()
+     << " dst_sel:" << Dst.getDstSel()
+     << " dst_unused:" << Dst.getDstUnused() << '\n';
+  return OS;
+}
+
+#endif
+
+static bool isSameBB(const MachineInstr *FirstMI, const MachineInstr *SecondMI) {
+  assert(FirstMI && SecondMI);
+  return FirstMI->getParent() == SecondMI->getParent();
+}
+
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
+  assert(To.isReg() && From.isReg());
+  To.setReg(From.getReg());
+  To.setSubReg(From.getSubReg());
+  To.setIsUndef(From.isUndef());
+  if (To.isUse()) {
+    To.setIsKill(From.isKill());
+  } else {
+    To.setIsDead(From.isDead());
+  }
+}
+
+static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
+  return LHS.isReg() &&
+         RHS.isReg() &&
+         LHS.getReg() == RHS.getReg() &&
+         LHS.getSubReg() == RHS.getSubReg();
+}
+
+static bool isSubregOf(const MachineOperand &SubReg,
+                       const MachineOperand &SuperReg,
+                       const TargetRegisterInfo *TRI) {
+
+  if (!SuperReg.isReg() || !SubReg.isReg())
+    return false;
+
+  if (isSameReg(SuperReg, SubReg))
+    return true;
+
+  if (SuperReg.getReg() != SubReg.getReg())
+    return false;
+
+  LaneBitmask::Type SuperMask =
+      TRI->getSubRegIndexLaneMask(SuperReg.getSubReg()).getAsInteger();
+  LaneBitmask::Type SubMask =
+      TRI->getSubRegIndexLaneMask(SubReg.getSubReg()).getAsInteger();
+  return TRI->regmaskSubsetEqual(&SubMask, &SuperMask);
+}
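isSubregOf compares lane masks rather than raw subregister indices, so it
treats a register as a subreg of itself and handles nested indices uniformly.
An illustration with invented lane-mask values (the real masks come from the
target description, these are only for the example):

    // assume %0:vreg_64 with sub0 -> mask 0x3 and sub1 -> mask 0xC
    // isSubregOf(%0.sub0, %0)      == true   (0x3 is a subset of the full mask)
    // isSubregOf(%0.sub0, %0.sub1) == false  (0x3 is not a subset of 0xC)
    // isSubregOf(%0,      %0.sub0) == false  (full mask exceeds 0x3)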
+uint64_t SDWASrcOperand::getSrcMods() const {
+  uint64_t Mods = 0;
+  if (Abs || Neg) {
+    assert(!Sext &&
+           "Float and integer src modifiers can't be set simultaneously");
+    Mods |= Abs ? SISrcMods::ABS : 0;
+    Mods |= Neg ? SISrcMods::NEG : 0;
+  } else if (Sext) {
+    Mods |= SISrcMods::SEXT;
+  }
+
+  return Mods;
+}
+
+MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
+  // For an SDWA src operand, the potential instruction is the one that uses
+  // the register defined by the parent instruction.
+  MachineRegisterInfo *MRI = getMRI();
+  MachineOperand *Replaced = getReplacedOperand();
+  assert(Replaced->isReg());
+
+  MachineInstr *PotentialMI = nullptr;
+  for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
+    // If this is a use of another subreg of the dst reg then do nothing.
+    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
+      continue;
+
+    // If there is a use of dst in another basic block, or a use of a superreg
+    // of dst, then we should not combine this operand.
+    if (!isSameBB(PotentialMO.getParent(), getParentInst()) ||
+        !isSameReg(PotentialMO, *Replaced))
+      return nullptr;
+
+    // Check that PotentialMI is the only instruction that uses the dst reg.
+    if (PotentialMI == nullptr) {
+      PotentialMI = PotentialMO.getParent();
+    } else if (PotentialMI != PotentialMO.getParent()) {
+      return nullptr;
+    }
+  }
+
+  return PotentialMI;
+}
+
+bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+  // Find the operand in the instruction that matches the replaced operand and
+  // substitute the target operand for it. Set the corresponding src_sel.
+
+  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
+  MachineOperand *SrcMods =
+      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+  assert(Src && Src->isReg());
+  if (!isSameReg(*Src, *getReplacedOperand())) {
+    // If this is not src0 then it should be src1.
+    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
+    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+
+    assert(Src && Src->isReg());
+
+    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+        !isSameReg(*Src, *getReplacedOperand())) {
+      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
+      // operand to src2. This is not allowed.
+      return false;
+    }
+
+    assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
+  }
+  copyRegOperand(*Src, *getTargetOperand());
+  SrcSel->setImm(getSrcSel());
+  SrcMods->setImm(getSrcMods());
+  getTargetOperand()->setIsKill(false);
+  return true;
+}
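The two operand kinds search in opposite directions. An SDWASrcOperand starts
from the value defined by the matched shift/bfe/and instruction and looks for
its single same-block user; in pseudo-MIR (virtual register numbers invented
for the illustration):

    %1 = V_LSHRREV_B32_e32 16, %0   ; matched above: SDWA src %0, src_sel:WORD_1
    %2 = V_ADD_I32_e32 %1, %3       ; sole same-block user -> candidate for
                                    ; conversion to V_ADD_I32_sdwa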
+MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
+  // For an SDWA dst operand, the potential instruction is the one that
+  // defines the register that this operand uses.
+  MachineRegisterInfo *MRI = getMRI();
+  MachineInstr *ParentMI = getParentInst();
+  MachineOperand *Replaced = getReplacedOperand();
+  assert(Replaced->isReg());
+
+  for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
+    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
+      continue;
+
+    if (!isSameBB(getParentInst(), PotentialMO.getParent()) ||
+        !isSameReg(*Replaced, PotentialMO))
+      return nullptr;
+
+    // Check that ParentMI is the only instruction that uses the replaced
+    // register.
+    for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
+      if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
+          UseMO.getParent() != ParentMI) {
+        return nullptr;
+      }
+    }
+
+    // Due to SSA this should be the only def of the replaced register, so
+    // return it.
+    return PotentialMO.getParent();
+  }
+
+  return nullptr;
+}
+
+bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+  // Replace the vdst operand in MI with the target operand. Set dst_sel and
+  // dst_unused.
+
+  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+      getDstSel() != AMDGPU::SDWA::DWORD) {
+    // v_mac_f16/32_sdwa allows dst_sel to be only DWORD.
+    return false;
+  }
+
+  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  assert(Operand &&
+         Operand->isReg() &&
+         isSameReg(*Operand, *getReplacedOperand()));
+  copyRegOperand(*Operand, *getTargetOperand());
+  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
+  assert(DstSel);
+  DstSel->setImm(getDstSel());
+  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+  assert(DstUnused);
+  DstUnused->setImm(getDstUnused());
+
+  // Remove the original instruction because its register definition would
+  // conflict with the new instruction.
+  getParentInst()->eraseFromParent();
+  return true;
+}
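An SDWADstOperand is the mirror image: it starts from the value the matched
shift consumes and walks back to its single same-block def. Continuing the
same pseudo-MIR sketch:

    %2 = V_ADD_I32_e32 %1, %3       ; sole same-block def -> candidate
    %4 = V_LSHLREV_B32_e32 16, %2   ; matched: SDWA dst %4, dst_sel:WORD_1,
                                    ; dst_unused:UNUSED_PAD; erased on success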
+void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
+  for (MachineInstr &MI : MBB) {
+    unsigned Opcode = MI.getOpcode();
+    switch (Opcode) {
+    case AMDGPU::V_LSHRREV_B32_e32:
+    case AMDGPU::V_ASHRREV_I32_e32:
+    case AMDGPU::V_LSHLREV_B32_e32: {
+      // from: v_lshrrev_b32_e32 v1, 16/24, v0
+      // to SDWA src:v0 src_sel:WORD_1/BYTE_3
+
+      // from: v_ashrrev_i32_e32 v1, 16/24, v0
+      // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
+
+      // from: v_lshlrev_b32_e32 v1, 16/24, v0
+      // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      if (!Src0->isImm())
+        break;
+
+      int64_t Imm = Src0->getImm();
+      if (Imm != 16 && Imm != 24)
+        break;
+
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+      if (TRI->isPhysicalRegister(Src1->getReg()) ||
+          TRI->isPhysicalRegister(Dst->getReg()))
+        break;
+
+      if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
+        auto SDWADst = make_unique<SDWADstOperand>(
+            Dst, Src1, Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
+        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
+        SDWAOperands[&MI] = std::move(SDWADst);
+        ++NumSDWAPatternsFound;
+      } else {
+        auto SDWASrc = make_unique<SDWASrcOperand>(
+            Src1, Dst, Imm == 16 ? WORD_1 : BYTE_3, false, false,
+            Opcode != AMDGPU::V_LSHRREV_B32_e32);
+        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+        SDWAOperands[&MI] = std::move(SDWASrc);
+        ++NumSDWAPatternsFound;
+      }
+      break;
+    }
+
+    case AMDGPU::V_LSHRREV_B16_e32:
+    case AMDGPU::V_ASHRREV_I16_e32:
+    case AMDGPU::V_LSHLREV_B16_e32: {
+      // from: v_lshrrev_b16_e32 v1, 8, v0
+      // to SDWA src:v0 src_sel:BYTE_1
+
+      // from: v_ashrrev_i16_e32 v1, 8, v0
+      // to SDWA src:v0 src_sel:BYTE_1 sext:1
+
+      // from: v_lshlrev_b16_e32 v1, 8, v0
+      // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      if (!Src0->isImm() || Src0->getImm() != 8)
+        break;
+
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+      if (TRI->isPhysicalRegister(Src1->getReg()) ||
+          TRI->isPhysicalRegister(Dst->getReg()))
+        break;
+
+      if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
+        auto SDWADst =
+            make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
+        SDWAOperands[&MI] = std::move(SDWADst);
+        ++NumSDWAPatternsFound;
+      } else {
+        auto SDWASrc = make_unique<SDWASrcOperand>(
+            Src1, Dst, BYTE_1, false, false,
+            Opcode != AMDGPU::V_LSHRREV_B16_e32);
+        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+        SDWAOperands[&MI] = std::move(SDWASrc);
+        ++NumSDWAPatternsFound;
+      }
+      break;
+    }
+    case AMDGPU::V_BFE_I32:
+    case AMDGPU::V_BFE_U32: {
+      // e.g.:
+      // from: v_bfe_u32 v1, v0, 8, 8
+      // to SDWA src:v0 src_sel:BYTE_1
+
+      // offset | width | src_sel
+      // ------------------------
+      //      0 |     8 | BYTE_0
+      //      0 |    16 | WORD_0
+      //      0 |    32 | DWORD ?
+      //      8 |     8 | BYTE_1
+      //     16 |     8 | BYTE_2
+      //     16 |    16 | WORD_1
+      //     24 |     8 | BYTE_3
+
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      if (!Src1->isImm())
+        break;
+      int64_t Offset = Src1->getImm();
+
+      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+      if (!Src2->isImm())
+        break;
+      int64_t Width = Src2->getImm();
+
+      SdwaSel SrcSel = DWORD;
+
+      if (Offset == 0 && Width == 8)
+        SrcSel = BYTE_0;
+      else if (Offset == 0 && Width == 16)
+        SrcSel = WORD_0;
+      else if (Offset == 0 && Width == 32)
+        SrcSel = DWORD;
+      else if (Offset == 8 && Width == 8)
+        SrcSel = BYTE_1;
+      else if (Offset == 16 && Width == 8)
+        SrcSel = BYTE_2;
+      else if (Offset == 16 && Width == 16)
+        SrcSel = WORD_1;
+      else if (Offset == 24 && Width == 8)
+        SrcSel = BYTE_3;
+      else
+        break;
+
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+      if (TRI->isPhysicalRegister(Src0->getReg()) ||
+          TRI->isPhysicalRegister(Dst->getReg()))
+        break;
+
+      auto SDWASrc = make_unique<SDWASrcOperand>(
+          Src0, Dst, SrcSel, false, false,
+          Opcode != AMDGPU::V_BFE_U32);
+      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+      SDWAOperands[&MI] = std::move(SDWASrc);
+      ++NumSDWAPatternsFound;
+      break;
+    }
+    case AMDGPU::V_AND_B32_e32: {
+      // e.g.:
+      // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
+      // to SDWA src:v0 src_sel:WORD_0/BYTE_0
+
+      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+      if (!Src0->isImm())
+        break;
+
+      int64_t Imm = Src0->getImm();
+      if (Imm != 0x0000ffff && Imm != 0x000000ff)
+        break;
+
+      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+      if (TRI->isPhysicalRegister(Src1->getReg()) ||
+          TRI->isPhysicalRegister(Dst->getReg()))
+        break;
+
+      auto SDWASrc = make_unique<SDWASrcOperand>(
+          Src1, Dst, Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+      SDWAOperands[&MI] = std::move(SDWASrc);
+      ++NumSDWAPatternsFound;
+      break;
+    }
+    }
+  }
+}
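convertToSDWA below builds the new MachineInstr operand by operand, so the
order of add()/addImm() calls has to match the operand list of the _sdwa
MCInstrDesc. As a reading aid, the neutral instruction it constructs before
the matched SDWAOperands overwrite individual fields looks roughly like this
(layout inferred from the sequence of calls that follow, not from the .td
files):

    // vdst?, src0_modifiers, src0, src1_modifiers?, src1?, src2 (v_mac only),
    // clamp, dst_sel?, dst_unused?, src0_sel, src1_sel?   ('?' = if present)
    //   %vdst = V_ADD_I32_sdwa 0, %src0, 0, %src1,
    //           0 /*clamp*/, DWORD, UNUSED_PAD, DWORD, DWORD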
+bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+                                   const SDWAOperandsVector &SDWAOperands) {
+  // Check if this instruction can be converted to SDWA:
+  // 1. Does this opcode support SDWA?
+  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
+    return false;
+
+  // 2. Are all operands VGPRs?
+  for (const MachineOperand &Operand : MI.explicit_operands()) {
+    if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
+      return false;
+  }
+
+  // Convert to SDWA.
+  int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
+  assert(SDWAOpcode != -1);
+
+  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
+
+  // Create the SDWA version of instruction MI and initialize its operands.
+  MachineInstrBuilder SDWAInst =
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+
+  // Copy dst; if it is present in the original then it should also be present
+  // in the SDWA instruction.
+  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  if (Dst) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
+    SDWAInst.add(*Dst);
+  } else {
+    assert(TII->isVOPC(MI));
+  }
+
+  // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
+  // and src0_modifiers (except for v_nop_sdwa, but it can't get here).
+  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  assert(
+      Src0 &&
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
+  SDWAInst.addImm(0);
+  SDWAInst.add(*Src0);
+
+  // Copy src1 if present, and initialize src1_modifiers.
+  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Src1) {
+    assert(
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
+    SDWAInst.addImm(0);
+    SDWAInst.add(*Src1);
+  } else {
+    assert(TII->isVOP1(MI));
+  }
+
+  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
+      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
+    // v_mac_f16/32 has an additional src2 operand tied to vdst.
+    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    assert(Src2);
+    SDWAInst.add(*Src2);
+  }
+
+  // Initialize clamp.
+  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
+  SDWAInst.addImm(0);
+
+  // Initialize dst_sel and dst_unused if present.
+  if (Dst) {
+    assert(
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
+        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
+    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+  }
+
+  // Initialize src0_sel.
+  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
+  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+
+  // Initialize src1_sel if present.
+  if (Src1) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
+    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+  }
+
+  // Apply all SDWA operand patterns.
+  bool Converted = false;
+  for (auto &Operand : SDWAOperands) {
+    Converted |= Operand->convertToSDWA(*SDWAInst, TII);
+  }
+  if (!Converted) {
+    SDWAInst->eraseFromParent();
+    return false;
+  }
+
+  DEBUG(dbgs() << "Convert instruction:" << MI
+               << "Into:" << *SDWAInst << '\n');
+  ++NumSDWAInstructionsPeepholed;
+
+  MI.eraseFromParent();
+  return true;
+}
+
+bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  if (!ST.hasSDWA() ||
+      !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
+    return false;
+  }
+
+  MRI = &MF.getRegInfo();
+  TRI = ST.getRegisterInfo();
+  TII = ST.getInstrInfo();
+
+  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+
+  bool Ret = false;
+
+  // FIXME: For now we only combine instructions in one basic block.
+  for (MachineBasicBlock &MBB : MF) {
+    SDWAOperands.clear();
+    matchSDWAOperands(MBB);
+
+    PotentialMatches.clear();
+    for (auto &OperandPair : SDWAOperands) {
+      auto &Operand = OperandPair.second;
+      MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+      if (PotentialMI) {
+        PotentialMatches[PotentialMI].push_back(std::move(Operand));
+      }
+    }
+
+    for (auto &PotentialPair : PotentialMatches) {
+      MachineInstr &PotentialMI = *PotentialPair.first;
+      // Report modification so the pass manager knows the function changed.
+      Ret |= convertToSDWA(PotentialMI, PotentialPair.second);
+    }
+  }
+  return Ret;
+}
Index: llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td
@@ -318,7 +318,7 @@
   let SDWA = 1;
   let Uses = [EXEC];
 
-  let SubtargetPredicate = HasSDWA;
+  let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst);
   let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst);
   let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
                            AMDGPUAsmVariants.Disable);
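Because the cl::opt above defaults to off, nothing changes unless the flag is
passed; the test below therefore runs llc twice. A representative invocation
to inspect the SDWA output by hand, mirroring the second RUN line (input.ll
stands for any file containing the functions below):

    llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole \
        -mattr=-fp64-fp16-denormals -verify-machineinstrs < input.ll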
Index: llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -0,0 +1,372 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji --amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}add_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+; NOSDWA-NOT: v_add_i32_sdwa
+
+; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %a = load i32, i32 addrspace(1)* %in, align 4
+  %shr = lshr i32 %a, 16
+  %add = add i32 %a, %shr
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_subrev_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+; NOSDWA-NOT: v_subrev_i32_sdwa
+
+; SDWA: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %a = load i32, i32 addrspace(1)* %in, align 4
+  %shr = lshr i32 %a, 16
+  %sub = sub i32 %shr, %a
+  store i32 %sub, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]]
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+define void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) {
+  %a = load i32, i32 addrspace(1)* %in1, align 4
+  %b = load i32, i32 addrspace(1)* %in2, align 4
+  %shra = lshr i32 %a, 16
+  %shrb = lshr i32 %b, 16
+  %mul = mul i32 %shra, %shrb
+  store i32 %mul, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_i16:
+; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) {
+entry:
+  %a = load i16, i16 addrspace(1)* %ina, align 4
+  %b = load i16, i16 addrspace(1)* %inb, align 4
+  %mul = mul i16 %a, %b
+  store i16 %mul, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2i16:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL]], v{{[0-9]+}}
+
+define void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <2 x i16> %a, %b
+  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}}
+
+define void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4
+  %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <4 x i16> %a, %b
+  store <4 x i16> %mul, <4 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL3]], v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL2]], v{{[0-9]+}}
+
+define void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4
+  %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <8 x i16> %a, %b
+  store <8 x i16> %mul, <8 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_half:
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_f16_sdwa
+
+define void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) {
+entry:
+  %a = load half, half addrspace(1)* %ina, align 4
+  %b = load half, half addrspace(1)* %inb, align 4
+  %mul = fmul half %a, %b
+  store half %mul, half addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2half:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA: v_mul_f16_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+define void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
+  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <2 x half> %a, %b
+  store <2 x half> %mul, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+; GCN-LABEL: {{^}}mul_v4half:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+define void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
+  %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <4 x half> %a, %b
+  store <4 x half> %mul, <4 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8half:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+define void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
+  %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <8 x half> %a, %b
+  store <8 x half> %mul, <8 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_i8:
+; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) {
+entry:
+  %a = load i8, i8 addrspace(1)* %ina, align 4
+  %b = load i8, i8 addrspace(1)* %inb, align 4
+  %mul = mul i8 %a, %b
+  store i8 %mul, i8 addrspace(1)* %out, align 4
+  ret void
+}
+; GCN-LABEL: {{^}}mul_v2i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+
+define void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4
+  %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4
+  %mul = mul <2 x i8> %a, %b
+  store <2 x i8> %mul, <2 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+
+define void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
+entry:
+  %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4
+  %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4
+  %mul = mul <4 x i8> %a, %b
+  store <4 x i8> %mul, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+; SDWA: v_mul_u32_u24_sdwa
+
+define void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
+entry:
+  %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4
+  %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4
+  %mul = mul <8 x i8> %a, %b
+  store <8 x i8> %mul, <8 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}mac_v2half:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mac_f16_sdwa
+
+; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+
+define void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
+  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <2 x half> %a, %b
+  %mac = fadd <2 x half> %mul, %b
+  store <2 x half> %mac, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}immediate_mul_v2i16:
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
+  %mul = mul <2 x i16> %a, <i16 123, i16 321>
+  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+; Double use of the same src - should not convert it.
+; GCN-LABEL: {{^}}mulmul_v2i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <2 x i16> %a, %b
+  %mul2 = mul <2 x i16> %mul, %b
+  store <2 x i16> %mul2, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_add_v2i16:
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; NOSDWA-NOT: v_add_i32_sdwa
+; SDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA-NOT: v_add_i32_sdwa
+
+define void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %cond = load i1, i1 addrspace(1)* %incond, align 4
+  br i1 %cond, label %mul_label, label %add_label
+mul_label:
+  %mul = mul <2 x i16> %a, %b
+  br label %store_label
+add_label:
+  %add = add <2 x i16> %a, %b
+  br label %store_label
+store_label:
+  %store = phi <2 x i16> [%mul, %mul_label], [%add, %add_label]
+  store <2 x i16> %store, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}