Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -84,7 +84,8 @@
 
 enum {
   StateWQM = 0x1,
-  StateExact = 0x2,
+  StateWWM = 0x2,
+  StateExact = 0x4,
 };
 
 struct PrintState {
@@ -97,9 +98,14 @@
 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
   if (PS.State & StateWQM)
     OS << "WQM";
-  if (PS.State & StateExact) {
+  if (PS.State & StateWWM) {
     if (PS.State & StateWQM)
       OS << '|';
+    OS << "WWM";
+  }
+  if (PS.State & StateExact) {
+    if (PS.State & (StateWQM | StateWWM))
+      OS << '|';
     OS << "Exact";
   }
 
@@ -128,6 +134,7 @@
 
 class SIWholeQuadMode : public MachineFunctionPass {
 private:
+  CallingConv::ID callingConv;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
@@ -141,7 +148,8 @@
 
   void markInstruction(MachineInstr &MI, char Flag,
                        std::vector<WorkItem> &Worklist);
-  void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
+  void markInstructionUses(const MachineInstr &MI, char Flag,
+                           std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -159,6 +167,10 @@
                unsigned SaveWQM, unsigned LiveMaskReg);
   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
              unsigned SavedWQM);
+  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+             unsigned SaveOrig);
+  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+               unsigned SavedOrig);
   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
 
   void lowerLiveMaskQueries(unsigned LiveMaskReg);
@@ -218,7 +230,7 @@
                                       std::vector<WorkItem> &Worklist) {
   InstrInfo &II = Instructions[&MI];
 
-  assert(Flag == StateWQM || Flag == StateExact);
+  assert(Flag == StateWQM || Flag == StateWWM || Flag == StateExact);
 
   // Ignore if the instruction is already marked. The typical case is that we
   // mark an instruction WQM multiple times, but for atomics it can happen that
@@ -231,9 +243,9 @@
   Worklist.push_back(&MI);
 }
 
-/// Mark all instructions defining the uses in \p MI as WQM.
-void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
-                                  std::vector<WorkItem> &Worklist) {
+/// Mark all instructions defining the uses in \p MI with the given Flag.
+void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
+                                          std::vector<WorkItem> &Worklist) {
   for (const MachineOperand &Use : MI.uses()) {
     if (!Use.isReg() || !Use.isUse())
       continue;
@@ -258,7 +270,7 @@
         if (Value->isPHIDef())
           continue;
 
-        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                         Worklist);
       }
 
@@ -266,7 +278,7 @@
     }
 
     for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
-      markInstruction(DefMI, StateWQM, Worklist);
+      markInstruction(DefMI, Flag, Worklist);
   }
 }
 
@@ -285,13 +297,23 @@
       unsigned Opcode = MI.getOpcode();
       char Flags = 0;
 
-      if (TII->isDS(Opcode)) {
+      if (TII->isDS(Opcode) && callingConv == CallingConv::AMDGPU_PS) {
         Flags = StateWQM;
+      } else if (TII->isDPP(Opcode)) {
+        unsigned wqmType = MI.getOperand(1).getImm();
+        if (wqmType == 0) {
+          Flags = StateExact;
+        } else if (wqmType == 1) {
+          Flags = StateWQM;
+        } else {
+          assert(wqmType == 2);
+          Flags = StateWWM;
+        }
       } else if (TII->isWQM(Opcode)) {
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
         // computed for derivatives.
-        markUsesWQM(MI, Worklist);
+        markInstructionUses(MI, StateWQM, Worklist);
         GlobalFlags |= StateWQM;
         continue;
       } else if (TII->isDisableWQM(MI)) {
@@ -345,14 +367,14 @@
 
   // Propagate to block level
   BI.Needs |= II.Needs;
-  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
-    BI.InNeeds |= II.Needs;
+  if ((BI.InNeeds | (II.Needs & ~StateWWM)) != BI.InNeeds) {
+    BI.InNeeds |= (II.Needs & ~StateWWM);
     Worklist.push_back(MBB);
   }
 
   // Propagate backwards within block
   if (MachineInstr *PrevMI = MI.getPrevNode()) {
-    char InNeeds = II.Needs | II.OutNeeds;
+    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
     if (!PrevMI->isPHI()) {
       InstrInfo &PrevII = Instructions[PrevMI];
       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
@@ -362,11 +384,11 @@
     }
   }
 
-  // Propagate WQM flag to instruction inputs
+  // Propagate WQM and WWM flags to instruction inputs
   assert(II.Needs != (StateWQM | StateExact));
 
-  if (II.Needs == StateWQM)
-    markUsesWQM(MI, Worklist);
+  if (II.Needs != 0 && II.Needs != StateExact)
+    markInstructionUses(MI, II.Needs, Worklist);
 }
 
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -558,6 +580,33 @@
   LIS->InsertMachineInstrInMaps(*MI);
 }
 
+void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator Before,
+                            unsigned SaveOrig)
+{
+  MachineInstr *MI;
+
+  assert(SaveOrig);
+  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveOrig)
+           .addReg(AMDGPU::EXEC);
+  LIS->InsertMachineInstrInMaps(*MI);
+  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+           .addImm(-1);
+  LIS->InsertMachineInstrInMaps(*MI);
+}
+
+void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator Before,
+                              unsigned SavedOrig)
+{
+  MachineInstr *MI;
+
+  assert(SavedOrig);
+  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+           .addReg(SavedOrig);
+  LIS->InsertMachineInstrInMaps(*MI);
+}
+
 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                    bool isEntry) {
   auto BII = Blocks.find(&MBB);
@@ -566,19 +615,18 @@
 
   const BlockInfo &BI = BII->second;
 
-  if (!(BI.InNeeds & StateWQM))
-    return;
-
   // This is a non-entry block that is WQM throughout, so no need to do
   // anything.
-  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
     return;
 
   DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
 
   unsigned SavedWQMReg = 0;
+  unsigned SavedNonWWMReg = 0;
   bool WQMFromExec = isEntry;
   char State = isEntry ? StateExact : StateWQM;
+  char NonWWMState = 0;
 
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   if (isEntry)
@@ -625,7 +673,16 @@
             prepareInsertion(MBB, First, II, Needs == StateWQM,
                              Needs == StateExact || WQMFromExec);
 
-        if (Needs == StateExact) {
+        if (State == StateWWM) {
+          State = NonWWMState;
+          fromWWM(MBB, Before, SavedNonWWMReg);
+        }
+
+        if (Needs == StateWWM) {
+          NonWWMState = State;
+          SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+          toWWM(MBB, Before, SavedNonWWMReg);
+        } else if (Needs == StateExact) {
           if (!WQMFromExec && (OutNeeds & StateWQM))
             SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 
@@ -667,12 +724,10 @@
 }
 
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
-  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
-    return false;
-
   Instructions.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
+  callingConv = MF.getFunction()->getCallingConv();
 
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
 
@@ -682,7 +737,7 @@
   LIS = &getAnalysis<LiveIntervals>();
 
   char GlobalFlags = analyzeFunction(MF);
-  if (!(GlobalFlags & StateWQM)) {
+  if (!(GlobalFlags & (StateWQM | StateWWM))) {
     lowerLiveMaskQueries(AMDGPU::EXEC);
     return !LiveMaskQueries.empty();
   }