Index: llvm/trunk/include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/SelectionDAG.h
+++ llvm/trunk/include/llvm/CodeGen/SelectionDAG.h
@@ -740,6 +740,9 @@
     return getNode(ISD::CALLSEQ_END, DL, NodeTys, Ops);
   }
 
+  /// Return true if applying \p Opcode to \p Ops always yields undef.
+  bool isUndef(unsigned Opcode, ArrayRef<SDValue> Ops);
+
   /// Return an UNDEF node. UNDEF does not have a useful SDLoc.
   SDValue getUNDEF(EVT VT) {
     return getNode(ISD::UNDEF, SDLoc(), VT);
Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2524,15 +2524,7 @@
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
-  // X / undef -> undef
-  // X % undef -> undef
-  if (N1.isUndef())
-    return N1;
-
-  // X / 0 --> undef
-  // X % 0 --> undef
-  // We don't need to preserve faults!
-  if (isNullConstantOrNullSplatConstant(N1))
+  if (DAG.isUndef(N->getOpcode(), {N0, N1}))
     return DAG.getUNDEF(VT);
 
   // undef / X -> 0
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3695,12 +3695,6 @@
   if (Cst1->isOpaque() || Cst2->isOpaque())
     return SDValue();
 
-  // Division/remainder with a zero divisor is undefined behavior.
-  if ((Opcode == ISD::SDIV || Opcode == ISD::UDIV ||
-       Opcode == ISD::SREM || Opcode == ISD::UREM) &&
-      Cst2->isNullValue())
-    return getUNDEF(VT);
-
   std::pair<APInt, bool> Folded = FoldValue(Opcode, Cst1->getAPIntValue(),
                                             Cst2->getAPIntValue());
   if (!Folded.second)
@@ -3728,6 +3722,30 @@
                           GA->getOffset() + uint64_t(Offset));
 }
 
+bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
+  switch (Opcode) {
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM: {
+    // Div/rem is undef if the divisor (Ops[1]) is undef or zero, or if it is
+    // a constant build vector with at least one undef or zero element.
+    assert(Ops.size() == 2 && "Div/rem should have 2 operands");
+    SDValue Divisor = Ops[1];
+    if (Divisor.isUndef() || isNullConstant(Divisor))
+      return true;
+
+    return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) &&
+           any_of(Divisor->op_values(),
+                  [](SDValue V) { return V.isUndef() || isNullConstant(V); });
+    // TODO: Handle signed overflow.
+  }
+  // TODO: Handle oversized shifts. All other opcodes report false for now.
+  default:
+    return false;
+  }
+}
+
 SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
                                              EVT VT, SDNode *Cst1,
                                              SDNode *Cst2) {
@@ -3737,6 +3755,9 @@
   if (Opcode >= ISD::BUILTIN_OP_END)
     return SDValue();
 
+  if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)}))
+    return getUNDEF(VT);
+
   // Handle the case of two scalars.
   if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) {
     if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) {
@@ -3804,6 +3825,9 @@
   if (Opcode >= ISD::BUILTIN_OP_END)
     return SDValue();
 
+  if (isUndef(Opcode, Ops))
+    return getUNDEF(VT);
+
   // We can only fold vectors - maybe merge with FoldConstantArithmetic someday?
   if (!VT.isVector())
     return SDValue();
Index: llvm/trunk/test/CodeGen/X86/div-rem-simplify.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/div-rem-simplify.ll
+++ llvm/trunk/test/CodeGen/X86/div-rem-simplify.ll
@@ -152,7 +152,6 @@
 define <4 x i32> @sdiv0elt_vec(<4 x i32> %x) {
 ; CHECK-LABEL: sdiv0elt_vec:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <u,12,u,4294967292>
 ; CHECK-NEXT:    retq
   %zero = and <4 x i32> %x, <i32 0, i32 0, i32 0, i32 0>
   %some_ones = or <4 x i32> %zero, <i32 0, i32 -1, i32 0, i32 3>
@@ -163,7 +162,6 @@
 define <4 x i32> @udiv0elt_vec(<4 x i32> %x) {
 ; CHECK-LABEL: udiv0elt_vec:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <u,4,3,u>
 ; CHECK-NEXT:    retq
   %div = udiv <4 x i32> <i32 11, i32 12, i32 13, i32 14>, <i32 0, i32 3, i32 4, i32 0>
   ret <4 x i32> %div
@@ -172,7 +170,6 @@
 define <4 x i32> @urem0elt_vec(<4 x i32> %x) {
 ; CHECK-LABEL: urem0elt_vec:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <u,u,u,2>
 ; CHECK-NEXT:    retq
   %zero = and <4 x i32> %x, <i32 0, i32 0, i32 0, i32 0>
   %some_ones = or <4 x i32> %zero, <i32 0, i32 0, i32 0, i32 3>
@@ -183,8 +180,6 @@
 define <4 x i32> @srem0elt_vec(<4 x i32> %x) {
 ; CHECK-LABEL: srem0elt_vec:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    movl $-2, %eax
-; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    retq
   %rem = srem <4 x i32> <i32 -11, i32 -12, i32 -13, i32 -14>, <i32 -3, i32 -3, i32 0, i32 2>
   ret <4 x i32> %rem
Index: llvm/trunk/test/CodeGen/X86/vec_sdiv_to_shift.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vec_sdiv_to_shift.ll
+++ llvm/trunk/test/CodeGen/X86/vec_sdiv_to_shift.ll
@@ -184,27 +184,15 @@
   ret <16 x i16> %a0
 }
 
-; TODO: The div-by-0 lanes are folded away, so we use scalar ops. Would it be better to keep this in the vector unit?
+; Div-by-0 in any lane is UB.
 
 define <4 x i32> @sdiv_non_splat(<4 x i32> %x) {
 ; SSE-LABEL: sdiv_non_splat:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    shrl $31, %ecx
-; SSE-NEXT:    addl %eax, %ecx
-; SSE-NEXT:    sarl %ecx
-; SSE-NEXT:    movd %ecx, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sdiv_non_splat:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    shrl $31, %ecx
-; AVX-NEXT:    addl %eax, %ecx
-; AVX-NEXT:    sarl %ecx
-; AVX-NEXT:    vmovd %ecx, %xmm0
 ; AVX-NEXT:    retq
   %y = sdiv <4 x i32> %x, <i32 2, i32 0, i32 0, i32 0>
   ret <4 x i32> %y