Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1554,6 +1554,61 @@
   return false;
 }
 
+// Convert Cmp(Const, a) into the canonical representation Cmp(a, Const).
+// This function canonicalizes a compare by reordering its operands if needed.
+// As a result, the compare predicate may also be changed in order to preserve
+// the original compare's meaning, e.g. (a < b) -> (b > a).
+// Otherwise, the comparison predicate remains valid for the new operand
+// order, e.g. (a == b) -> (b == a).
+//
+// TODO: Add a canonical representation for SSE's compare intrinsics.
+// The target must have VEX encoding.
+
+static bool X86CreateCanonicalCMP(IntrinsicInst *II) {
+  Value *LHS = II->getOperand(0);
+  Value *RHS = II->getOperand(1);
+  StringRef IntrinsicName = II->getCalledFunction()->getName();
+  // This assertion ensures that only AVX and later intrinsics reach the
+  // compare canonicalization.
+  assert(IntrinsicName.contains("avx") &&
+         "Canonical representation supports only intrinsics with VEX encoding");
+  if (isa<Constant>(LHS) && !isa<Constant>(RHS)) {
+    assert((II->getOperand(2)->getType() == Type::getInt32Ty(II->getContext())) &&
+           "Operand must be defined by an i32 type");
+    ConstantInt *ComparisonValue = dyn_cast<ConstantInt>(II->getOperand(2));
+    uint64_t ConstantValue = ComparisonValue->getZExtValue();
+    // When the two low bits of the predicate are "01" or "10" (i.e. 1 or 2),
+    // the predicate is a "directional" comparison ('<', '<=', '!<', '!<='),
+    // so the direction of the compare must be flipped when the operands are
+    // exchanged.
+    //
+    // In the 128-bit legacy SSE version, the comparison predicate operand is
+    // an 8-bit immediate: bits 2:0 of the immediate define the type of
+    // comparison to be performed and bits 7:3 are reserved.
+    //
+    // The first three bits encode:
+    // Equal, Less-than, Less-than-or-equal, Unordered, Not-equal,
+    // Not-less-than, Not-less-than-or-equal and Ordered.
+    //
+    // In the VEX version, two more bits were added to the immediate. For the
+    // "directional" predicates (<, <=, !<, !<=) the fourth bit selects the
+    // corresponding "greater" predicate, so flipping the four low bits of a
+    // "directional" predicate maps each "less" predicate one-to-one onto its
+    // swapped "greater" equivalent:
+    //   xor(immediate('<'), 0x0F) == immediate('>')
+    // The same holds for predicates with the 0x10 bit set.
+    if ((ConstantValue & 0x3) == 1 || (ConstantValue & 0x3) == 2) {
+      const APInt NewComparison(32, (ConstantValue ^ 0xf));
+      II->setOperand(2, ConstantInt::get(Type::getInt32Ty(II->getContext()),
+                                         NewComparison));
+    }
+    II->setArgOperand(0, RHS);
+    II->setArgOperand(1, LHS);
+    return true;
+  }
+  return false;
+}
+
 // Convert NVVM intrinsics to target-generic LLVM code where possible.
 static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
   // Each NVVM intrinsic we can simplify can be replaced with one of:
@@ -2290,6 +2345,11 @@
     break;
   }
 
+  case Intrinsic::x86_avx512_mask_cmp_ss:
+  case Intrinsic::x86_avx512_mask_cmp_sd:
+    if (X86CreateCanonicalCMP(II))
+      return II;
+    LLVM_FALLTHROUGH;
   case Intrinsic::x86_sse_comieq_ss:
   case Intrinsic::x86_sse_comige_ss:
   case Intrinsic::x86_sse_comigt_ss:
@@ -2315,9 +2375,7 @@
   case Intrinsic::x86_sse2_ucomilt_sd:
   case Intrinsic::x86_sse2_ucomineq_sd:
   case Intrinsic::x86_avx512_vcomi_ss:
-  case Intrinsic::x86_avx512_vcomi_sd:
-  case Intrinsic::x86_avx512_mask_cmp_ss:
-  case Intrinsic::x86_avx512_mask_cmp_sd: {
+  case Intrinsic::x86_avx512_vcomi_sd: {
     // These intrinsics only demand the 0th element of their input vectors. If
     // we can simplify the input based on that, do so now.
     bool MadeChange = false;
@@ -2336,6 +2394,15 @@
       return II;
     break;
   }
+  case Intrinsic::x86_avx512_mask_cmp_pd_128:
+  case Intrinsic::x86_avx512_mask_cmp_pd_256:
+  case Intrinsic::x86_avx512_mask_cmp_pd_512:
+  case Intrinsic::x86_avx512_mask_cmp_ps_128:
+  case Intrinsic::x86_avx512_mask_cmp_ps_256:
+  case Intrinsic::x86_avx512_mask_cmp_ps_512:
+    if (X86CreateCanonicalCMP(II))
+      return II;
+    break;
   case Intrinsic::x86_avx512_mask_add_ps_512:
   case Intrinsic::x86_avx512_mask_div_ps_512:
Index: test/Transforms/InstCombine/X86CanonicCmp.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/X86CanonicCmp.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; This test checks that compare intrinsics are canonicalized, i.e. that
+; Cmp(const, a, pred) is converted into Cmp(a, const, swap(pred)) with the correct comparison immediate.
+; This canonicalization is valid only for AVX and later intrinsics.
+
+define i8 @canonical_compare_representationPD128(<2 x double> %a) {
+; CHECK-LABEL: @canonical_compare_representationPD128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A:%.*]], <2 x double> zeroinitializer, i32 10, i8 -1)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> zeroinitializer, <2 x double> %a, i32 5, i8 -1)
+  ret i8 %0
+}
+
+define i8 @canonical_compare_representationPD256(<4 x double> %a) {
+; CHECK-LABEL: @canonical_compare_representationPD256(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A:%.*]], <4 x double> zeroinitializer, i32 10, i8 -1)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> %a, i32 5, i8 -1)
+  ret i8 %0
+}
+
+define i8 @canonical_compare_representationPD512(<8 x double> %a) {
+; CHECK-LABEL: @canonical_compare_representationPD512(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> zeroinitializer, i32 11, i8 -1, i32 4)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> zeroinitializer, <8 x double> %a, i32 11, i8 -1, i32 4)
+  ret i8 %0
+}
+
+define i8 @canonical_compare_representationPS128(<4 x float> %a) {
+; CHECK-LABEL: @canonical_compare_representationPS128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A:%.*]], <4 x float> zeroinitializer, i32 12, i8 -1)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> zeroinitializer, <4 x float> %a, i32 12, i8 -1)
+  ret i8 %0
+}
+
+define i8 @canonical_compare_representationPS256(<8 x float> %a) {
+; CHECK-LABEL: @canonical_compare_representationPS256(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A:%.*]], <8 x float> zeroinitializer, i32 10, i8 -1)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> %a, i32 5, i8 -1)
+  ret i8 %0
+}
+
+define i16 @canonical_compare_representationPS512(<16 x float> %a) {
+; CHECK-LABEL: @canonical_compare_representationPS512(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> zeroinitializer, i32 11, i16 -1, i32 4)
+; CHECK-NEXT:    ret i16 [[TMP0]]
+;
+entry:
+  %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> zeroinitializer, <16 x float> %a, i32 11, i16 -1, i32 4)
+  ret i16 %0
+}
+
+define i8 @canonical_compare_representationSS(<4 x float> %a) {
+; CHECK-LABEL: @canonical_compare_representationSS(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[A:%.*]], <4 x float> zeroinitializer, i32 10, i8 -1, i32 4)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> zeroinitializer, <4 x float> %a, i32 5, i8 -1, i32 4)
+  ret i8 %0
+}
+
+define i8 @canonical_compare_representationSD(<2 x double> %a) {
+; CHECK-LABEL: @canonical_compare_representationSD(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[A:%.*]], <2 x double> zeroinitializer, i32 10, i8 -1, i32 4)
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
+entry:
+  %0 = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> zeroinitializer, <2 x double> %a, i32 5, i8 -1, i32 4)
+  ret i8 %0
+}
+
+
+declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, i8)
+declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, i8)
+declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i8, i32)
+declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8)
+declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)
+declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32)
+declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
+declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
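
For reference, a standalone C++ sketch of the predicate flip described in the X86CreateCanonicalCMP comment. It is not part of the patch; the helper name flipDirectionalPredicate is illustrative only. It uses the standard AVX _CMP_* immediate encodings and mirrors the values exercised by the tests above:

// Illustrative only: shows how XOR-ing the four low bits maps each
// "directional" predicate onto its operand-swapped equivalent.
#include <cassert>
#include <cstdint>

static uint64_t flipDirectionalPredicate(uint64_t Pred) {
  // Only predicates whose two low bits are 1 or 2 ('<', '<=', '!<', '!<=')
  // need to change direction when the compare operands are exchanged.
  if ((Pred & 0x3) == 1 || (Pred & 0x3) == 2)
    return Pred ^ 0xf;
  return Pred;
}

int main() {
  assert(flipDirectionalPredicate(0x01) == 0x0E); // _CMP_LT_OS  -> _CMP_GT_OS
  assert(flipDirectionalPredicate(0x05) == 0x0A); // _CMP_NLT_US -> _CMP_NGT_US (5 -> 10, as in the tests)
  assert(flipDirectionalPredicate(0x11) == 0x1E); // _CMP_LT_OQ  -> _CMP_GT_OQ (0x10 bit preserved)
  assert(flipDirectionalPredicate(0x00) == 0x00); // _CMP_EQ_OQ is symmetric, left unchanged
  assert(flipDirectionalPredicate(0x03) == 0x03); // _CMP_UNORD_Q is symmetric, left unchanged
  return 0;
}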