Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -38,6 +38,7 @@
 FunctionPass *createAMDGPUCFGStructurizerPass();
 
 // SI Passes
+FunctionPass *createGCNPeepholeOptimizerPass();
 FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
@@ -59,6 +60,9 @@
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
+void initializeGCNPeepholeOptimizerPass(PassRegistry &);
+extern char &GCNPeepholeOptimizerID;
+
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
 
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -372,6 +372,7 @@
 }
 
 void GCNPassConfig::addPreEmitPass() {
+  addPass(createGCNPeepholeOptimizerPass(), false);
   addPass(createSIInsertWaitsPass(), false);
   addPass(createSILowerControlFlowPass(), false);
   if (InsertNops) {
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -33,6 +33,7 @@
   AMDGPUInstrInfo.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPURegisterInfo.cpp
+  GCNPeepholeOptimizer.cpp
   R600ClauseMergePass.cpp
   R600ControlFlowFinalizer.cpp
   R600EmitClauseMarkers.cpp
Index: lib/Target/AMDGPU/GCNPeepholeOptimizer.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/GCNPeepholeOptimizer.cpp
@@ -0,0 +1,120 @@
+//===-- GCNPeepholeOptimizer.cpp - Peephole optimizations for GCN ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This contains some peephole optimizations for GCN targets.  Once
+/// GlobalISel is finished, this pass may be used as a replacement for our
+/// target dag combines.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gcn-peephole"
+
+namespace {
+
+/// Machine-function pass for GCN peephole cleanups; merges adjacent S_NOPs.
+class GCNPeepholeOptimizer : public MachineFunctionPass {
+private:
+  const SIInstrInfo *TII; // Set at the start of runOnMachineFunction().
+
+  MachineBasicBlock::iterator
+  optimizeS_NOP(MachineBasicBlock::iterator I) const;
+
+public:
+  static char ID;
+
+  GCNPeepholeOptimizer() : MachineFunctionPass(ID), TII(nullptr) {
+    // Register the pass; this patch adds no other call to the initializer.
+    initializeGCNPeepholeOptimizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override { return "GCN Peephole optimizer"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG(); // Declared to the pass manager: CFG is preserved.
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace
+
+// Pass ID storage; exported below as llvm::GCNPeepholeOptimizerID.
+char GCNPeepholeOptimizer::ID = 0;
+
+// No analysis dependencies are declared, so the single-macro form is used.
+INITIALIZE_PASS(GCNPeepholeOptimizer, DEBUG_TYPE, "GCN Peephole Optimizer",
+                false, false)
+
+char &llvm::GCNPeepholeOptimizerID = GCNPeepholeOptimizer::ID;
+
+FunctionPass *llvm::createGCNPeepholeOptimizerPass() {
+  return new GCNPeepholeOptimizer(); // Fresh instance on every call.
+}
+
+/// Combine the run of consecutive S_NOPs that starts at \p I into as few as
+/// possible and \returns the iterator just past that run.
+MachineBasicBlock::iterator
+GCNPeepholeOptimizer::optimizeS_NOP(MachineBasicBlock::iterator I) const {
+  MachineBasicBlock *MBB = I->getParent();
+  MachineBasicBlock::iterator End = MBB->end();
+  MachineBasicBlock::iterator Nop = std::next(I);
+
+  // Bail early if we don't have consecutive S_NOPs.
+  if (Nop == End || Nop->getOpcode() != AMDGPU::S_NOP)
+    return Nop;
+
+  // Check against End before dereferencing: the run may reach the block's end.
+  unsigned NopCount = 0;
+  MachineBasicBlock::iterator Next;
+  for (Nop = I; Nop != End && Nop->getOpcode() == AMDGPU::S_NOP; Nop = Next) {
+    Next = std::next(Nop); // Saved first; Nop is erased below.
+    NopCount += 1 + Nop->getOperand(0).getImm(); // Immediate plus one each.
+    Nop->eraseFromParent();
+  }
+  TII->insertWaitStates(*MBB, Nop, NopCount);
+  return Nop;
+}
+
+/// Merge runs of consecutive S_NOPs in every block of \p MF.
+/// \returns true if any instruction was rewritten.
+bool GCNPeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    MachineBasicBlock::iterator I = MBB.begin();
+    while (I != MBB.end()) {
+      MachineBasicBlock::iterator Next = std::next(I);
+      if (I->getOpcode() == AMDGPU::S_NOP) {
+        // optimizeS_NOP() rewrites only when another S_NOP directly follows;
+        // it may erase I, so continue from the iterator it hands back.
+        Changed |= (Next != MBB.end() && Next->getOpcode() == AMDGPU::S_NOP);
+        Next = optimizeS_NOP(I);
+      }
+      I = Next;
+    }
+  }
+  return Changed;
+}