Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1554,6 +1554,51 @@
   return false;
 }
 
+// Convert Cmp(Const, a) into the canonical representation Cmp(a, Const).
+// This function canonicalizes a compare by reordering its operands if needed.
+// As a result, the compare direction may also change in order to preserve
+// the original meaning, e.g. (a < b) -> (b > a).
+// Otherwise the comparison immediate stays valid for the new operand order,
+// e.g. (a == b) -> (b == a).
+// The target must support VEX encoding.
+
+static bool X86CreateCanonicalCMP(IntrinsicInst *II) {
+  Value *LHS = II->getOperand(0);
+  Value *RHS = II->getOperand(1);
+  if (isa<Constant>(LHS) && !isa<Constant>(RHS)) {
+    ConstantInt *ComparisonValue = cast<ConstantInt>(II->getOperand(2));
+    uint64_t ConstantValue = ComparisonValue->getZExtValue();
+    // When the two low bits of the compare immediate are "01" or "10"
+    // (i.e. 1 or 2), the immediate encodes a "direction" comparison
+    // ('<', '<=', ...), and the direction of the compare has to change
+    // when the operands are exchanged.
+    //
+    // In the 128-bit legacy SSE version the comparison predicate operand
+    // is an 8-bit immediate; bits 2:0 define the type of comparison to be
+    // performed and bits 7:3 are reserved.
+    //
+    // The first three bits encode: Equal, Less-than, Less-than-or-equal,
+    // Unordered, Not-equal, Not-less-than, Not-less-than-or-equal and
+    // Ordered.
+    //
+    // The VEX version adds two more bits to the immediate. For the
+    // "relation" predicates (<, <=, !<, !<=) flipping the four low bits
+    // (xor with 0x0F) maps each "less" predicate to the equivalent
+    // "greater" predicate, e.g.
+    // xor(immediate('<'), 0x0F) == immediate('>').
+    // The same holds for immediates with bit 0x10 set.
+    if ((ConstantValue & 0x3) == 1 || (ConstantValue & 0x3) == 2) {
+      const APInt NewComparison(32, (ConstantValue ^ 0xf));
+      II->setOperand(2, ConstantExpr::getIntegerValue(
+                            Type::getInt32Ty(II->getContext()), NewComparison));
+    }
+    II->setArgOperand(0, RHS);
+    II->setArgOperand(1, LHS);
+    return true;
+  }
+  return false;
+}
+
 // Convert NVVM intrinsics to target-generic LLVM code where possible.
 static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
   // Each NVVM intrinsic we can simplify can be replaced with one of:
@@ -2290,6 +2335,11 @@
     break;
   }
 
+  case Intrinsic::x86_avx512_mask_cmp_ss:
+  case Intrinsic::x86_avx512_mask_cmp_sd:
+    if (X86CreateCanonicalCMP(II))
+      return II;
+    LLVM_FALLTHROUGH;
   case Intrinsic::x86_sse_comieq_ss:
   case Intrinsic::x86_sse_comige_ss:
   case Intrinsic::x86_sse_comigt_ss:
@@ -2315,9 +2365,7 @@
   case Intrinsic::x86_sse2_ucomilt_sd:
   case Intrinsic::x86_sse2_ucomineq_sd:
   case Intrinsic::x86_avx512_vcomi_ss:
-  case Intrinsic::x86_avx512_vcomi_sd:
-  case Intrinsic::x86_avx512_mask_cmp_ss:
-  case Intrinsic::x86_avx512_mask_cmp_sd: {
+  case Intrinsic::x86_avx512_vcomi_sd: {
     // These intrinsics only demand the 0th element of their input vectors. If
     // we can simplify the input based on that, do so now.
     bool MadeChange = false;
@@ -2336,6 +2384,15 @@
       return II;
     break;
   }
+  case Intrinsic::x86_avx512_mask_cmp_pd_128:
+  case Intrinsic::x86_avx512_mask_cmp_pd_256:
+  case Intrinsic::x86_avx512_mask_cmp_pd_512:
+  case Intrinsic::x86_avx512_mask_cmp_ps_128:
+  case Intrinsic::x86_avx512_mask_cmp_ps_256:
+  case Intrinsic::x86_avx512_mask_cmp_ps_512:
+    if (X86CreateCanonicalCMP(II))
+      return II;
+    break;
 
   case Intrinsic::x86_avx512_mask_add_ps_512:
   case Intrinsic::x86_avx512_mask_div_ps_512:
@@ -2445,6 +2502,11 @@
     LLVM_FALLTHROUGH;
 
   // X86 scalar intrinsics simplified with SimplifyDemandedVectorElts.
+  case Intrinsic::x86_sse2_cmp_sd:
+  case Intrinsic::x86_sse_cmp_ss:
+    if (X86CreateCanonicalCMP(II))
+      return II;
+    LLVM_FALLTHROUGH;
   case Intrinsic::x86_avx512_mask_max_ss_round:
   case Intrinsic::x86_avx512_mask_min_ss_round:
   case Intrinsic::x86_avx512_mask_max_sd_round:
@@ -2467,10 +2529,8 @@
   case Intrinsic::x86_fma_vfmsub_sd:
   case Intrinsic::x86_fma_vfnmadd_sd:
   case Intrinsic::x86_fma_vfnmsub_sd:
-  case Intrinsic::x86_sse_cmp_ss:
   case Intrinsic::x86_sse_min_ss:
   case Intrinsic::x86_sse_max_ss:
-  case Intrinsic::x86_sse2_cmp_sd:
   case Intrinsic::x86_sse2_min_sd:
   case Intrinsic::x86_sse2_max_sd:
   case Intrinsic::x86_sse41_round_ss:
Index: test/Transforms/InstCombine/X86CanonicCmp.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/X86CanonicCmp.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; This test checks the transformation of a compare into its canonical form.
+
+define zeroext i16 @functionTest(<2 x double> %a, <4 x double> %b, <8 x double> %c, <4 x float> %d, <8 x float> %e, <16 x float> %f) local_unnamed_addr {
+entry:
+  ; CHECK: %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> , i32 1, i8 -1)
+  ; CHECK: %1 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> zeroinitializer, i32 14, i8 -1)
+  ; CHECK: %2 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %b, <4 x double> zeroinitializer, i32 10, i8 -1)
+  ; CHECK: %3 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> zeroinitializer, i32 11, i8 -1, i32 4)
+  ; CHECK: %4 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %d, <4 x float> zeroinitializer, i32 12, i8 -1)
+  ; CHECK: %5 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %e, <8 x float> zeroinitializer, i32 15, i8 -1)
+  ; CHECK: %6 = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %a, <2 x double> zeroinitializer, i32 16, i8 -1, i32 4)
+  ; CHECK: %7 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %f, <16 x float> zeroinitializer, i32 0, i16 -1, i32 4)
+
+  ; Checking that a CMP with two constant comparison operands is left unchanged.
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> , i32 1, i8 -1)
+
+  ; Checking that canonicalization converts Cmp(const, a) into Cmp(a, const) with the correct comparison immediate.
+  %1 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %a, i32 1, i8 -1)
+  %2 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %b, i32 5, i8 -1)
+  %3 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %c, i32 11, i8 -1, i32 4)
+  %4 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %d, i32 12, i8 -1)
+  %5 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %e, i32 15, i8 -1)
+  %6 = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> zeroinitializer, <2 x double> %a, i32 16, i8 -1, i32 4)
+
+  ; Checking that canonicalization converts Cmp(const, a) into Cmp(a, const) without changing the comparison immediate.
+  %7 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %f, i32 0, i16 -1, i32 4)
+
+  %and38 = and i8 %1, %0
+  %and39 = and i8 %and38, %2
+  %and2039 = and i8 %and39, %3
+  %and2240 = and i8 %and2039, %4
+  %and2441 = and i8 %and2240, %5
+  %and2442 = and i8 %and2441, %6
+  %conv27 = zext i8 %and2442 to i16
+  %and28 = and i16 %7, %conv27
+  ret i16 %and28
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, i8)
+declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, i8)
+declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i8, i32)
+declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8)
+declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
+declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32)
+declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
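
For reference, a minimal standalone sketch (not part of the patch) of the immediate rewrite X86CreateCanonicalCMP applies when it swaps the operands. The predicate values follow the documented SSE/AVX compare-immediate encoding (0x01 = LT_OS, 0x0E = GT_OS, 0x05 = NLT_US, 0x0A = NGT_US, 0x11 = LT_OQ, 0x1E = GT_OQ); the helper name flipPredicate is purely illustrative.

// Illustration only: mirrors the predicate flip performed by X86CreateCanonicalCMP.
#include <cassert>
#include <cstdint>

// When the two low bits of the compare immediate are 01 or 10, the predicate
// is a "direction" compare ('<', '<=', '!<', '!<='); xor-ing the four low bits
// with 0x0F turns it into the equivalent "greater" predicate, which is what
// makes swapping the two vector operands legal.
static uint64_t flipPredicate(uint64_t Imm) {
  if ((Imm & 0x3) == 1 || (Imm & 0x3) == 2)
    return Imm ^ 0xF;
  return Imm; // EQ/NEQ/ORD/UNORD-style predicates are direction-neutral.
}

int main() {
  assert(flipPredicate(0x01) == 0x0E); // LT_OS  -> GT_OS  (%1 in the test: 1 -> 14)
  assert(flipPredicate(0x05) == 0x0A); // NLT_US -> NGT_US (%2 in the test: 5 -> 10)
  assert(flipPredicate(0x0B) == 0x0B); // low bits are 11: unchanged (%3: 11 stays 11)
  assert(flipPredicate(0x10) == 0x10); // low bits are 00: unchanged (%6: 16 stays 16)
  assert(flipPredicate(0x11) == 0x1E); // LT_OQ  -> GT_OQ  (VEX-only immediate range)
  return 0;
}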