diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1940,6 +1940,8 @@
 
     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i32,  2 },
 
+    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 4 },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 4 },
     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i16,  1 },
     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v4i32,  1 },
     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v2i64,  1 },
@@ -2015,6 +2017,8 @@
     { ISD::TRUNCATE,    MVT::v8i1,   MVT::v8i64,  9 },
     { ISD::TRUNCATE,    MVT::v16i1,  MVT::v16i64, 11 },
 
+    { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
+    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i32, 6 },
     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 2 }, // and+extract+packuswb
     { ISD::TRUNCATE,    MVT::v16i8,  MVT::v8i32,  5 },
     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fix.ll b/llvm/test/Analysis/CostModel/X86/arith-fix.ll
--- a/llvm/test/Analysis/CostModel/X86/arith-fix.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fix.ll
@@ -81,8 +81,8 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
@@ -100,8 +100,8 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
@@ -214,8 +214,8 @@
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
@@ -315,8 +315,8 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
@@ -334,8 +334,8 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
@@ -448,8 +448,8 @@
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 114 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 53 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 106 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
--- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -1037,8 +1037,8 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
@@ -1056,8 +1056,8 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
@@ -1170,8 +1170,8 @@
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
@@ -1275,8 +1275,8 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
@@ -1294,8 +1294,8 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
@@ -1408,8 +1408,8 @@
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll
--- a/llvm/test/Analysis/CostModel/X86/cast.ll
+++ b/llvm/test/Analysis/CostModel/X86/cast.ll
@@ -167,8 +167,8 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'zext_sext'
@@ -197,8 +197,8 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'zext_sext'
diff --git a/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll b/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
--- a/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
+++ b/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
@@ -218,8 +218,8 @@
 ; VEC256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %A = trunc <8 x i64> undef to <8 x i32>
 ; VEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %B = trunc <8 x i64> undef to <8 x i16>
 ; VEC256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %C = trunc <8 x i64> undef to <8 x i8>
-; VEC256-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %D = trunc <16 x i32> undef to <16 x i16>
-; VEC256-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %E = trunc <16 x i32> undef to <16 x i8>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %D = trunc <16 x i32> undef to <16 x i16>
+; VEC256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %E = trunc <16 x i32> undef to <16 x i8>
 ; VEC256-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %F = trunc <32 x i16> undef to <32 x i8>
 ; VEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/trunc.ll b/llvm/test/Analysis/CostModel/X86/trunc.ll
--- a/llvm/test/Analysis/CostModel/X86/trunc.ll
+++ b/llvm/test/Analysis/CostModel/X86/trunc.ll
@@ -117,8 +117,8 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'trunc_vXi16'
@@ -132,8 +132,8 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'trunc_vXi16'
@@ -162,8 +162,8 @@
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i16>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i16>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i16
@@ -267,9 +267,9 @@
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
@@ -291,9 +291,9 @@
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
@@ -363,9 +363,9 @@
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i8>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i8>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i8
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i8>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i8>
diff --git a/mlir/include/mlir/Analysis/Presburger/Simplex.h b/mlir/include/mlir/Analysis/Presburger/Simplex.h
--- a/mlir/include/mlir/Analysis/Presburger/Simplex.h
+++ b/mlir/include/mlir/Analysis/Presburger/Simplex.h
@@ -237,6 +237,13 @@
   void print(raw_ostream &os) const;
   void dump() const;
 
+  /// Check if the specified inequality already holds in the polytope.
+  bool isRedundant(ArrayRef<int64_t> coeffs);
+
+  /// Check if the specified FAC contains the polytope,
+  /// i.e. whether all inequalities of the specified FAC hold for the polytope.
+  bool containedIn(const FlatAffineConstraints &fac);
+
 private:
   friend class GBRSimplex;
 
diff --git a/mlir/include/mlir/Analysis/PresburgerSet.h b/mlir/include/mlir/Analysis/PresburgerSet.h
--- a/mlir/include/mlir/Analysis/PresburgerSet.h
+++ b/mlir/include/mlir/Analysis/PresburgerSet.h
@@ -94,6 +94,12 @@
   /// any of the FACs in the union are unbounded.
   bool findIntegerSample(SmallVectorImpl<int64_t> &sample);
 
+  /// Simplifies the representation of a PresburgerSet.
+  ///
+  /// Inparticular, removes all FACs which are subset of other FACs in the
+  /// union.
+  PresburgerSet coalesce();
+
 private:
   /// Construct an empty PresburgerSet.
   PresburgerSet(unsigned nDim = 0, unsigned nSym = 0)
diff --git a/mlir/lib/Analysis/Presburger/Simplex.cpp b/mlir/lib/Analysis/Presburger/Simplex.cpp
--- a/mlir/lib/Analysis/Presburger/Simplex.cpp
+++ b/mlir/lib/Analysis/Presburger/Simplex.cpp
@@ -1240,4 +1240,25 @@
 
 void Simplex::dump() const { print(llvm::errs()); }
 
+bool Simplex::containedIn(const FlatAffineConstraints &fac) {
+
+  if (isEmpty())
+    return true;
+
+  for (unsigned i = 0, e = fac.getNumInequalities(); i < e; ++i)
+    if (!isRedundant(fac.getInequality(i)))
+      return false;
+
+  for (unsigned i = 0, e = fac.getNumEqualities(); i < e; ++i)
+    if (!isRedundant(fac.getEquality(i)))
+      return false;
+
+  return true;
+}
+
+bool Simplex::isRedundant(ArrayRef<int64_t> coeffs) {
+  Optional<Fraction> minimum = computeOptimum(Direction::Down, coeffs);
+  return minimum && *minimum >= Fraction(0, 1);
+}
+
 } // namespace mlir
diff --git a/mlir/lib/Analysis/PresburgerSet.cpp b/mlir/lib/Analysis/PresburgerSet.cpp
--- a/mlir/lib/Analysis/PresburgerSet.cpp
+++ b/mlir/lib/Analysis/PresburgerSet.cpp
@@ -381,6 +381,39 @@
   return false;
 }
 
+PresburgerSet PresburgerSet::coalesce() {
+  PresburgerSet newSet =
+      PresburgerSet::getEmptySet(this->getNumDims(), this->getNumSyms());
+  SmallVector<bool, 4> marked(this->getNumFACs());
+
+  for (unsigned i = 0, e = flatAffineConstraints.size(); i < e; i++) {
+    if (marked[i])
+      continue;
+    Simplex simplex(flatAffineConstraints[i]);
+
+    if (simplex.isEmpty()) {
+      marked[i] = true;
+      continue;
+    }
+    // check if bs1 is contained in any basicSet
+    for (unsigned j = 0, e = flatAffineConstraints.size(); j < e; ++j) {
+      if (j == i || marked[j])
+        continue;
+
+      if (simplex.containedIn(flatAffineConstraints[j])) {
+        marked[i] = true;
+        break;
+      }
+    }
+  }
+
+  for (unsigned i = 0, e = flatAffineConstraints.size(); i < e; ++i) {
+    if (!marked[i])
+      newSet.unionFACInPlace(flatAffineConstraints[i]);
+  }
+  return newSet;
+}
+
 void PresburgerSet::print(raw_ostream &os) const {
   os << getNumFACs() << " FlatAffineConstraints:\n";
   for (const FlatAffineConstraints &fac : flatAffineConstraints) {
diff --git a/mlir/unittests/Analysis/FACUtils.h b/mlir/unittests/Analysis/FACUtils.h
new file mode 100644
--- /dev/null
+++ b/mlir/unittests/Analysis/FACUtils.h
@@ -0,0 +1,41 @@
+//===- FACUtils.h - utility functions for testing -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/AffineStructures.h"
+
+namespace mlir {
+
+/// Construct a FlatAffineConstraints from a set of inequality and
+/// equality constraints. `numIds` is the total number of ids, of which
+/// `numLocals` is the number of local ids.
+static FlatAffineConstraints
+makeFACFromConstraints(unsigned numIds, ArrayRef<SmallVector<int64_t, 4>> ineqs,
+                       ArrayRef<SmallVector<int64_t, 4>> eqs,
+                       unsigned numLocals = 0) {
+  FlatAffineConstraints fac(/*numReservedInequalities=*/ineqs.size(),
+                            /*numReservedEqualities=*/eqs.size(),
+                            /*numReservedCols=*/numIds + 1,
+                            /*numDims=*/numIds - numLocals,
+                            /*numSymbols=*/0, numLocals);
+  for (const SmallVector<int64_t, 4> &eq : eqs)
+    fac.addEquality(eq);
+  for (const SmallVector<int64_t, 4> &ineq : ineqs)
+    fac.addInequality(ineq);
+  return fac;
+}
+
+/// Construct a FlatAffineConstraints having `numDims` dimensions from the given
+/// set of inequality constraints. This is a convenience function to be used
+/// when the FAC to be constructed does not have any local ids and does not have
+/// equalties.
+static FlatAffineConstraints
+makeFACFromIneqs(unsigned dims, ArrayRef<SmallVector<int64_t, 4>> ineqs) {
+  return makeFACFromConstraints(dims, ineqs, {});
+}
+
+} // namespace mlir
diff --git a/mlir/unittests/Analysis/Presburger/CMakeLists.txt b/mlir/unittests/Analysis/Presburger/CMakeLists.txt
--- a/mlir/unittests/Analysis/Presburger/CMakeLists.txt
+++ b/mlir/unittests/Analysis/Presburger/CMakeLists.txt
@@ -4,5 +4,5 @@
 )
 
 target_link_libraries(MLIRPresburgerTests
-  PRIVATE MLIRPresburger)
-
+  PRIVATE MLIRPresburger
+  MLIRLoopAnalysis)
diff --git a/mlir/unittests/Analysis/Presburger/SimplexTest.cpp b/mlir/unittests/Analysis/Presburger/SimplexTest.cpp
--- a/mlir/unittests/Analysis/Presburger/SimplexTest.cpp
+++ b/mlir/unittests/Analysis/Presburger/SimplexTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Analysis/Presburger/Simplex.h"
+#include "../FACUtils.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -410,4 +411,62 @@
   EXPECT_EQ(simplex.getNumConstraints(), 0u);
 }
 
+TEST(SimplexTest, isRedundant) {
+  Simplex simplex(2);
+  simplex.addInequality({0, -1, 2}); // [0]: y <= 2.
+  simplex.addInequality({1, 0, 0});  // [1]: x >= 0.
+  simplex.addEquality({-1, 1, 0});   // [2]: y = x.
+
+  EXPECT_TRUE(simplex.isRedundant({-1, 0, 2})); // x <= 2.
+  EXPECT_TRUE(simplex.isRedundant({0, 1, 0}));  // y >= 0.
+
+  EXPECT_FALSE(simplex.isRedundant({-1, 0, -1})); // x <= -1.
+  EXPECT_FALSE(simplex.isRedundant({0, 1, -2}));  // y >= 2.
+  EXPECT_FALSE(simplex.isRedundant({0, 1, -1}));  // y >= 1.
+}
+
+TEST(SimplexTest, ContainedIn) {
+
+  FlatAffineConstraints univ = FlatAffineConstraints::getUniverse(1, 0);
+
+  FlatAffineConstraints empty = makeFACFromIneqs(1, {{1, 0},     // x >= 0.
+                                                     {-1, -1}}); // x <= -1.
+
+  FlatAffineConstraints s1 = makeFACFromIneqs(1, {{1, 0},    // x >= 0.
+                                                  {-1, 4}}); // x <= 4.
+
+  FlatAffineConstraints s2 = makeFACFromIneqs(1, {{1, -1},   // x >= 1.
+                                                  {-1, 3}}); // x <= 3.
+
+  Simplex simUniv(univ);
+  Simplex simEmpty(empty);
+  Simplex sim1(s1);
+  Simplex sim2(s2);
+
+  EXPECT_TRUE(simUniv.containedIn(univ));
+  EXPECT_TRUE(simEmpty.containedIn(empty));
+  EXPECT_TRUE(sim1.containedIn(s1));
+  EXPECT_TRUE(sim2.containedIn(s2));
+
+  EXPECT_TRUE(simEmpty.containedIn(univ));
+  EXPECT_TRUE(simEmpty.containedIn(s1));
+  EXPECT_TRUE(simEmpty.containedIn(s2));
+  EXPECT_TRUE(simEmpty.containedIn(empty));
+
+  EXPECT_TRUE(simUniv.containedIn(univ));
+  EXPECT_FALSE(simUniv.containedIn(s1));
+  EXPECT_FALSE(simUniv.containedIn(s2));
+  EXPECT_FALSE(simUniv.containedIn(empty));
+
+  EXPECT_TRUE(sim1.containedIn(univ));
+  EXPECT_TRUE(sim1.containedIn(s1));
+  EXPECT_FALSE(sim1.containedIn(s2));
+  EXPECT_FALSE(sim1.containedIn(empty));
+
+  EXPECT_TRUE(sim2.containedIn(univ));
+  EXPECT_TRUE(sim2.containedIn(s1));
+  EXPECT_TRUE(sim2.containedIn(s2));
+  EXPECT_FALSE(sim2.containedIn(empty));
+}
+
 } // namespace mlir
diff --git a/mlir/unittests/Analysis/PresburgerSetTest.cpp b/mlir/unittests/Analysis/PresburgerSetTest.cpp
--- a/mlir/unittests/Analysis/PresburgerSetTest.cpp
+++ b/mlir/unittests/Analysis/PresburgerSetTest.cpp
@@ -15,6 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Analysis/PresburgerSet.h"
+#include "./FACUtils.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -79,34 +80,6 @@
   }
 }
 
-/// Construct a FlatAffineConstraints from a set of inequality and
-/// equality constraints. `numIds` is the total number of ids, of which
-/// `numLocals` is the number of local ids.
-static FlatAffineConstraints
-makeFACFromConstraints(unsigned numIds, ArrayRef<SmallVector<int64_t, 4>> ineqs,
-                       ArrayRef<SmallVector<int64_t, 4>> eqs,
-                       unsigned numLocals = 0) {
-  FlatAffineConstraints fac(/*numReservedInequalities=*/ineqs.size(),
-                            /*numReservedEqualities=*/eqs.size(),
-                            /*numReservedCols=*/numIds + 1,
-                            /*numDims=*/numIds - numLocals,
-                            /*numSymbols=*/0, numLocals);
-  for (const SmallVector<int64_t, 4> &eq : eqs)
-    fac.addEquality(eq);
-  for (const SmallVector<int64_t, 4> &ineq : ineqs)
-    fac.addInequality(ineq);
-  return fac;
-}
-
-/// Construct a FlatAffineConstraints having `numDims` dimensions from the given
-/// set of inequality constraints. This is a convenience function to be used
-/// when the FAC to be constructed does not have any local ids and does not have
-/// equalties.
-static FlatAffineConstraints
-makeFACFromIneqs(unsigned numDims, ArrayRef<SmallVector<int64_t, 4>> ineqs) {
-  return makeFACFromConstraints(numDims, ineqs, /*eqs=*/{});
-}
-
 /// Construct a PresburgerSet having `numDims` dimensions and no symbols from
 /// the given list of FlatAffineConstraints. Each FAC in `facs` should also have
 /// `numDims` dimensions and no symbols, although it can have any number of
@@ -638,4 +611,173 @@
   expectEqual(multiples3.intersect(evens), multiples6);
 }
 
+void expectCoalesce(size_t expectedNumBasicSets, PresburgerSet set) {
+  PresburgerSet newSet = set.coalesce();
+  if (!set.isEqual(newSet) || !(expectedNumBasicSets == newSet.getNumFACs())) {
+    set.dump();
+    newSet.dump();
+  }
+  EXPECT_TRUE(set.isEqual(newSet));
+  EXPECT_TRUE(expectedNumBasicSets == newSet.getNumFACs());
+}
+
+TEST(SetTest, coalesceContainedOneDim) {
+  PresburgerSet set =
+      makeSetFromFACs(1, {
+                             makeFACFromIneqs(1, {{1, 0},    // x >= 0.
+                                                  {-1, 4}}), // x <= 4.
+                             makeFACFromIneqs(1, {{1, -1},   // x >= 1.
+                                                  {-1, 2}}), // x <= 2.
+                         });
+  expectCoalesce(1, set);
+}
+
+TEST(SetTest, coalesceFirstEmpty) {
+  PresburgerSet set =
+      makeSetFromFACs(1, {
+                             makeFACFromIneqs(1, {{1, 0},     // x >= 0.
+                                                  {-1, -1}}), // x <= -1.
+                             makeFACFromIneqs(1, {{1, -1},    // x >= 1.
+                                                  {-1, 2}}),  // x <= 2.
+                         });
+  expectCoalesce(1, set);
+}
+
+TEST(SetTest, coalesceSecondEmpty) {
+  PresburgerSet set =
+      makeSetFromFACs(1, {
+                             makeFACFromIneqs(1, {{1, -1},    // x >= 1.
+                                                  {-1, 2}}),  // x <= 2.
+                             makeFACFromIneqs(1, {{1, 0},     // x >= 0.
+                                                  {-1, -1}}), // x <= -1.
+                         });
+  expectCoalesce(1, set);
+}
+
+TEST(SetTest, coalesceBothEmpty) {
+  PresburgerSet set =
+      makeSetFromFACs(1, {
+                             makeFACFromIneqs(1, {{1, -3},    // x >= 3.
+                                                  {-1, -1}}), // x <= -1.
+                             makeFACFromIneqs(1, {{1, 0},     // x >= 0.
+                                                  {-1, -1}}), // x <= -1.
+                         });
+  expectCoalesce(0, set);
+}
+
+TEST(SetTest, coalesceFirstUniv) {
+  PresburgerSet set =
+      makeSetFromFACs(1, {
+                             makeFACFromIneqs(1, {}),
+                             makeFACFromIneqs(1, {{1, 0},    // x >= 0.
+                                                  {-1, 1}}), // x <= 1.
+                         });
+  expectCoalesce(1, set);
+}
+
+TEST(SetTest, coalesceSecondUniv) {
+  PresburgerSet set =
+      makeSetFromFACs(1, {
+                             makeFACFromIneqs(1, {{1, 0},    // x >= 0.
+                                                  {-1, 1}}), // x <= 1.
+                             makeFACFromIneqs(1, {}),
+                         });
+  expectCoalesce(1, set);
+}
+
+TEST(SetTest, coalesceBothUniv) {
+  PresburgerSet set = makeSetFromFACs(1, {
+                                             makeFACFromIneqs(1, {}),
+                                             makeFACFromIneqs(1, {}),
+                                         });
+  expectCoalesce(1, set);
+}
+
+TEST(SetTest, coalesceFirstUnivSecondEmpty) {
+  PresburgerSet set =
+      makeSetFromFACs(1, {
+                             makeFACFromIneqs(1, {}),
+                             makeFACFromIneqs(1, {{1, 0},     // x >= 0.
+                                                  {-1, -1}}), // x <= -1.
+                         });
+  expectCoalesce(1, set);
+}
+
+TEST(SetTest, coalesceFirstEmptySecondUniv) {
+  PresburgerSet set =
+      makeSetFromFACs(1, {
+                             makeFACFromIneqs(1, {}),
+                             makeFACFromIneqs(1, {{1, 0},     // x >= 0.
+                                                  {-1, -1}}), // x <= -1.
+                         });
+  expectCoalesce(1, set);
+}
+
+TEST(SetTest, coalesceCutOneDim) {
+  PresburgerSet set =
+      makeSetFromFACs(1, {
+                             makeFACFromIneqs(1, {{1, 0},    // x >= 0.
+                                                  {-1, 3}}), // x <= 3.
+                             makeFACFromIneqs(1, {{1, -2},   // x >= 2.
+                                                  {-1, 4}}), // x <= 4.
+                         });
+  expectCoalesce(2, set);
+}
+
+TEST(SetTest, coalesceSeparateOneDim) {
+  PresburgerSet set =
+      makeSetFromFACs(1, {
+                             makeFACFromIneqs(1, {{1, 0},    // x >= 0.
+                                                  {-1, 2}}), // x <= 2.
+                             makeFACFromIneqs(1, {{1, -3},   // x >= 3.
+                                                  {-1, 4}}), // x <= 4.
+                         });
+  expectCoalesce(2, set);
+}
+
+TEST(SetTest, coalesceContainedTwoDim) {
+  PresburgerSet set =
+      makeSetFromFACs(2, {
+                             makeFACFromIneqs(2, {{1, 0, 0},    // x >= 0.
+                                                  {-1, 0, 3},   // x <= 3.
+                                                  {0, 1, 0},    // y >= 0
+                                                  {0, -1, 3}}), // y <= 3.
+                             makeFACFromIneqs(2, {{1, 0, 0},    // x >= 0.
+                                                  {-1, 0, 3},   // x <= 3.
+                                                  {0, 1, -2},   // y >= 2.
+                                                  {0, -1, 3}}), // y <= 3
+                         });
+  expectCoalesce(1, set);
+}
+
+TEST(SetTest, coalesceCutTwoDim) {
+  PresburgerSet set =
+      makeSetFromFACs(2, {
+                             makeFACFromIneqs(2, {{1, 0, 0},    // x >= 0.
+                                                  {-1, 0, 3},   // x <= 3.
+                                                  {0, 1, 0},    // y >= 0
+                                                  {0, -1, 2}}), // y <= 2.
+                             makeFACFromIneqs(2, {{1, 0, 0},    // x >= 0.
+                                                  {-1, 0, 3},   // x <= 3.
+                                                  {0, 1, -1},   // y >= 1.
+                                                  {0, -1, 3}}), // y <= 3
+                         });
+  expectCoalesce(2, set);
+}
+
+TEST(SetTest, coalesceSeparateTwoDim) {
+  PresburgerSet set =
+      makeSetFromFACs(2, {
+                             makeFACFromIneqs(2, {{1, 0, 0},    // x >= 0.
+                                                  {-1, 0, 3},   // x <= 3.
+                                                  {0, 1, 0},    // y >= 0
+                                                  {0, -1, 1}}), // y <= 1.
+                             makeFACFromIneqs(2, {{1, 0, 0},    // x >= 0.
+                                                  {-1, 0, 3},   // x <= 3.
+                                                  {0, 1, -2},   // y >= 2.
+                                                  {0, -1, 3}}), // y <= 3
+                         });
+  expectCoalesce(2, set);
+}
+
 } // namespace mlir