Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1554,6 +1554,61 @@
   return false;
 }
 
+// Convert Cmp(Const, a) into the canonical representation Cmp(a, Const).
+// This function canonicalizes a compare by reordering its operands if needed.
+// As a result, the compare predicate may also be changed in order to preserve
+// the original compare's meaning, e.g. (a < b) -> (b > a).
+// Otherwise, the comparison predicate remains valid for the new operand
+// order, e.g. (a == b) -> (b == a).
+//
+// TODO: Add a canonical representation for SSE's compare intrinsics.
+// The target must have VEX encoding.
+
+static bool X86CreateCanonicalCMP(IntrinsicInst *II) {
+  Value *LHS = II->getOperand(0);
+  Value *RHS = II->getOperand(1);
+  StringRef IntrinsicName = II->getCalledFunction()->getName();
+  // This assertion ensures that only AVX and later intrinsics reach the
+  // compare canonicalization.
+  assert(IntrinsicName.contains("avx") &&
+         "Canonical representation supports only intrinsics with VEX encoding");
+  if (isa<Constant>(LHS) && !isa<Constant>(RHS)) {
+    assert((II->getOperand(2)->getType() == Type::getInt32Ty(II->getContext())) &&
+           "Operand must be defined by an i32 type");
+    ConstantInt *ComparisonValue = dyn_cast<ConstantInt>(II->getOperand(2));
+    uint64_t ConstantValue = ComparisonValue->getZExtValue();
+    // When the two low bits of the predicate are "01" or "10" (i.e. 1 or 2),
+    // the predicate is a "directional" comparison ('<', '<=', '!<', '!<='),
+    // so the direction of the compare must be flipped when the operands are
+    // exchanged.
+    //
+    // In the 128-bit legacy SSE version, the comparison predicate operand is
+    // an 8-bit immediate: bits 2:0 of the immediate define the type of
+    // comparison to be performed and bits 7:3 are reserved.
+    //
+    // The first three bits encode:
+    // Equal, Less-than, Less-than-or-equal, Unordered, Not-equal,
+    // Not-less-than, Not-less-than-or-equal and Ordered.
+    //
+    // In the VEX version, two more bits were added to the immediate. For the
+    // "directional" predicates (<, <=, !<, !<=) the fourth bit selects the
+    // corresponding "greater" predicate, so flipping the four low bits of a
+    // "directional" predicate maps each "less" predicate one-to-one onto its
+    // swapped "greater" equivalent:
+    //   xor(immediate('<'), 0x0F) == immediate('>')
+    // The same holds for predicates with the 0x10 bit set.
+    if ((ConstantValue & 0x3) == 1 || (ConstantValue & 0x3) == 2) {
+      const APInt NewComparison(32, (ConstantValue ^ 0xf));
+      II->setOperand(2, ConstantInt::get(Type::getInt32Ty(II->getContext()),
+                                         NewComparison));
+    }
+    II->setArgOperand(0, RHS);
+    II->setArgOperand(1, LHS);
+    return true;
+  }
+  return false;
+}
+
 // Convert NVVM intrinsics to target-generic LLVM code where possible.
 static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
   // Each NVVM intrinsic we can simplify can be replaced with one of:
@@ -2290,6 +2345,11 @@
     break;
   }
 
+  case Intrinsic::x86_avx512_mask_cmp_ss:
+  case Intrinsic::x86_avx512_mask_cmp_sd:
+    if (X86CreateCanonicalCMP(II))
+      return II;
+    LLVM_FALLTHROUGH;
   case Intrinsic::x86_sse_comieq_ss:
   case Intrinsic::x86_sse_comige_ss:
   case Intrinsic::x86_sse_comigt_ss:
@@ -2315,9 +2375,7 @@
   case Intrinsic::x86_sse2_ucomilt_sd:
   case Intrinsic::x86_sse2_ucomineq_sd:
   case Intrinsic::x86_avx512_vcomi_ss:
-  case Intrinsic::x86_avx512_vcomi_sd:
-  case Intrinsic::x86_avx512_mask_cmp_ss:
-  case Intrinsic::x86_avx512_mask_cmp_sd: {
+  case Intrinsic::x86_avx512_vcomi_sd: {
     // These intrinsics only demand the 0th element of their input vectors. If
     // we can simplify the input based on that, do so now.
     bool MadeChange = false;
@@ -2336,6 +2394,15 @@
       return II;
     break;
   }
+  case Intrinsic::x86_avx512_mask_cmp_pd_128:
+  case Intrinsic::x86_avx512_mask_cmp_pd_256:
+  case Intrinsic::x86_avx512_mask_cmp_pd_512:
+  case Intrinsic::x86_avx512_mask_cmp_ps_128:
+  case Intrinsic::x86_avx512_mask_cmp_ps_256:
+  case Intrinsic::x86_avx512_mask_cmp_ps_512:
+    if (X86CreateCanonicalCMP(II))
+      return II;
+    break;
   case Intrinsic::x86_avx512_mask_add_ps_512:
   case Intrinsic::x86_avx512_mask_div_ps_512:
Index: test/Transforms/InstCombine/X86CanonicCmp.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/X86CanonicCmp.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; This test checks that compare intrinsics are canonicalized, i.e. that
+; Cmp(const, a, pred) is converted into Cmp(a, const, swap(pred)) with the correct comparison immediate.
+; This canonicalization is valid only for AVX and later intrinsics.
+
+define i8 @canonical_compare_representationPD128(<2 x double> %a) {
+; CHECK-LABEL: @canonical_compare_representationPD128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> zeroinitializer, i32 10, i8 -1)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %a, i32 5, i8 -1)
+  ret i8 %0
+}
+
+define i8 @canonical_compare_representationPD256(<4 x double> %a) {
+; CHECK-LABEL: @canonical_compare_representationPD256(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> zeroinitializer, i32 10, i8 -1)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %a, i32 5, i8 -1)
+  ret i8 %0
+}
+
+define i8 @canonical_compare_representationPD512(<8 x double> %a) {
+; CHECK-LABEL: @canonical_compare_representationPD512(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> zeroinitializer, i32 11, i8 -1, i32 4)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %a, i32 11, i8 -1, i32 4)
+  ret i8 %0
+}
+
+define i8 @canonical_compare_representationPS128(<4 x float> %a) {
+; CHECK-LABEL: @canonical_compare_representationPS128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> zeroinitializer, i32 12, i8 -1)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %a, i32 12, i8 -1)
+  ret i8 %0
+}
+
+define i8 @canonical_compare_representationPS256(<8 x float> %a) {
+; CHECK-LABEL: @canonical_compare_representationPS256(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> zeroinitializer, i32 10, i8 -1)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %a, i32 5, i8 -1)
+  ret i8 %0
+}
+
+define i16 @canonical_compare_representationPS512(<16 x float> %a) {
+; CHECK-LABEL: @canonical_compare_representationPS512(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> zeroinitializer, i32 11, i16 -1, i32 4)
+; CHECK-NEXT:    ret i16 [[TMP0]]
+;
+entry:
+  %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %a, i32 11, i16 -1, i32 4)
+  ret i16 %0
+}
+
+define i8 @canonical_compare_representationSS(<4 x float> %a) {
+; CHECK-LABEL: @canonical_compare_representationSS(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[A:%.*]], <4 x float> zeroinitializer, i32 10, i8 -1, i32 4)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> zeroinitializer, <4 x float> %a, i32 5, i8 -1, i32 4)
+  ret i8 %0
+}
+
+define i8 @canonical_compare_representationSD(<2 x double> %a) {
+; CHECK-LABEL: @canonical_compare_representationSD(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[A:%.*]], <2 x double> zeroinitializer, i32 10, i8 -1, i32 4)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> zeroinitializer, <2 x double> %a, i32 5, i8 -1, i32 4)
+  ret i8 %0
+}
+
+
+declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, i8)
+declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, i8)
+declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i8, i32)
+declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8)
+declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
+declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32)
+declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
+declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
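
For reference, a standalone C++ sketch of the predicate flip described in the X86CreateCanonicalCMP comment. It is not part of the patch; the helper name flipDirectionalPredicate is illustrative only. It uses the standard AVX _CMP_* immediate encodings and mirrors the values exercised by the tests above:

// Illustrative only: shows how XOR-ing the four low bits maps each
// "directional" predicate onto its operand-swapped equivalent.
#include <cassert>
#include <cstdint>

static uint64_t flipDirectionalPredicate(uint64_t Pred) {
  // Only predicates whose two low bits are 1 or 2 ('<', '<=', '!<', '!<=')
  // need to change direction when the compare operands are exchanged.
  if ((Pred & 0x3) == 1 || (Pred & 0x3) == 2)
    return Pred ^ 0xf;
  return Pred;
}

int main() {
  assert(flipDirectionalPredicate(0x01) == 0x0E); // _CMP_LT_OS  -> _CMP_GT_OS
  assert(flipDirectionalPredicate(0x05) == 0x0A); // _CMP_NLT_US -> _CMP_NGT_US (5 -> 10, as in the tests)
  assert(flipDirectionalPredicate(0x11) == 0x1E); // _CMP_LT_OQ  -> _CMP_GT_OQ (0x10 bit preserved)
  assert(flipDirectionalPredicate(0x00) == 0x00); // _CMP_EQ_OQ is symmetric, left unchanged
  assert(flipDirectionalPredicate(0x03) == 0x03); // _CMP_UNORD_Q is symmetric, left unchanged
  return 0;
}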