Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -78,6 +78,7 @@ SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -506,6 +506,7 @@ setTargetDAGCombine(ISD::FABS); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::AssertSext); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); } //===----------------------------------------------------------------------===// @@ -2771,8 +2772,16 @@ static SDValue simplifyI24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - SDValue LHS = Node24->getOperand(0); - SDValue RHS = Node24->getOperand(1); + bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; + + SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0); + SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1); + unsigned NewOpcode = Node24->getOpcode(); + if (IsIntrin) { + unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue(); + NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ? 
+ AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; + } APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); @@ -2782,7 +2791,7 @@ SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded); SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded); if (DemandedLHS || DemandedRHS) - return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(), + return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), DemandedLHS ? DemandedLHS : LHS, DemandedRHS ? DemandedRHS : RHS); @@ -3020,6 +3029,19 @@ return SDValue(); } + +SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_mul_i24: + case Intrinsic::amdgcn_mul_u24: + return simplifyI24(N, DCI); + default: + return SDValue(); + } +} + /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. 
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -4108,6 +4130,8 @@ case ISD::AssertZext: case ISD::AssertSext: return performAssertSZExtCombine(N, DCI); + case ISD::INTRINSIC_WO_CHAIN: + return performIntrinsicWOChainCombine(N, DCI); } return SDValue(); } Index: test/CodeGen/AMDGPU/mad_int24.ll =================================================================== --- test/CodeGen/AMDGPU/mad_int24.ll +++ test/CodeGen/AMDGPU/mad_int24.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=FUNC ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC @@ -9,8 +9,8 @@ ; Make sure we aren't masking the inputs. 
; CM-NOT: AND ; CM: MULADD_INT24 -; SI-NOT: and -; SI: v_mad_i32_i24 +; GCN-NOT: and +; GCN: v_mad_i32_i24 define amdgpu_kernel void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = shl i32 %a, 8 @@ -22,3 +22,100 @@ store i32 %3, i32 addrspace(1)* %out ret void } + +; GCN-LABEL: {{^}}mad24_known_bits_destroyed: +; GCN: s_waitcnt +; GCN-NEXT: v_mad_i32_i24 +; GCN-NEXT: v_mul_i32_i24 +; GCN-NEXT: s_setpc_b64 +define i32 @mad24_known_bits_destroyed(i32 %a, i32 %b, i32 %c) { + + %shl.0 = shl i32 %a, 8 + %sra.0 = ashr i32 %shl.0, 8 + %shl.1 = shl i32 %b, 8 + %sra.1 = ashr i32 %shl.1, 8 + + %mul0 = mul nsw i32 %sra.0, %sra.1 + %add0 = add nsw i32 %mul0, %c + + %shl.2 = shl i32 %add0, 8 + %sra.2 = ashr i32 %shl.2, 8 + + %shl.3 = shl i32 %sra.0, 8 + %sra.3 = ashr i32 %shl.3, 8 + + %mul1 = mul nsw i32 %sra.2, %sra.3 + ret i32 %mul1 +} + +; GCN-LABEL: {{^}}mad24_intrin_known_bits_destroyed: +; GCN: s_waitcnt +; GCN-NEXT: v_mad_i32_i24 +; GCN-NEXT: v_mul_i32_i24 +; GCN-NEXT: s_setpc_b64 +define i32 @mad24_intrin_known_bits_destroyed(i32 %a, i32 %b, i32 %c) { + %shl.0 = shl i32 %a, 8 + %sra.0 = ashr i32 %shl.0, 8 + %shl.1 = shl i32 %b, 8 + %sra.1 = ashr i32 %shl.1, 8 + + %mul0 = call i32 @llvm.amdgcn.mul.i24(i32 %sra.0, i32 %sra.1) + %add0 = add nsw i32 %mul0, %c + + %shl.2 = shl i32 %add0, 8 + %sra.2 = ashr i32 %shl.2, 8 + + %shl.3 = shl i32 %sra.0, 8 + %sra.3 = ashr i32 %shl.3, 8 + + %mul1 = mul nsw i32 %sra.2, %sra.3 + ret i32 %mul1 +} + +; Make sure no unnecessary BFEs are emitted in the loop. 
+; GCN-LABEL: {{^}}mad24_destroyed_knownbits_2: +; GCN-NOT: v_bfe +; GCN: v_mad_i32_i24 +; GCN-NOT: v_bfe +; GCN: v_mad_i32_i24 +; GCN-NOT: v_bfe +; GCN: v_mad_i32_i24 +; GCN-NOT: v_bfe +; GCN: v_mad_i32_i24 +; GCN-NOT: v_bfe +define void @mad24_destroyed_knownbits_2(i32 %arg, i32 %arg1, i32 %arg2, i32 addrspace(1)* %arg3) { +bb: + br label %bb6 + +bb5: ; preds = %bb6 + ret void + +bb6: ; preds = %bb6, %bb + %tmp = phi i32 [ %tmp27, %bb6 ], [ 0, %bb ] + %tmp7 = phi i32 [ %arg2, %bb6 ], [ 1, %bb ] + %tmp8 = phi i32 [ %tmp26, %bb6 ], [ %arg, %bb ] + %tmp9 = shl i32 %tmp7, 8 + %tmp10 = ashr exact i32 %tmp9, 8 + %tmp11 = shl i32 %tmp8, 8 + %tmp12 = ashr exact i32 %tmp11, 8 + %tmp13 = mul nsw i32 %tmp12, %tmp10 + %tmp14 = add nsw i32 %tmp13, %tmp7 + %tmp15 = shl i32 %tmp14, 8 + %tmp16 = ashr exact i32 %tmp15, 8 + %tmp17 = mul nsw i32 %tmp16, %tmp10 + %tmp18 = add nsw i32 %tmp17, %tmp14 + %tmp19 = shl i32 %tmp18, 8 + %tmp20 = ashr exact i32 %tmp19, 8 + %tmp21 = mul nsw i32 %tmp20, %tmp16 + %tmp22 = add nsw i32 %tmp21, %tmp18 + %tmp23 = shl i32 %tmp22, 8 + %tmp24 = ashr exact i32 %tmp23, 8 + %tmp25 = mul nsw i32 %tmp24, %tmp20 + %tmp26 = add nsw i32 %tmp25, %tmp22 + store i32 %tmp26, i32 addrspace(1)* %arg3 + %tmp27 = add nuw i32 %tmp, 1 + %tmp28 = icmp eq i32 %tmp27, %arg1 + br i1 %tmp28, label %bb5, label %bb6 +} + +declare i32 @llvm.amdgcn.mul.i24(i32, i32)