Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -968,6 +968,11 @@
     SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
 
+    // This function looks at SETCC that compares integers. It replaces SETCC
+    // with integer subtraction when (1) there is a legal way of doing it and
+    // (2) keeping the result of comparison in GPR has some performance benefit.
+    SDValue ConvertSETCCToSubtract(SDNode *N, DAGCombinerInfo &DCI) const;
+
     SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                              int &RefinementSteps,
                              bool &UseOneConstNR) const override;
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9917,6 +9917,96 @@
   return false;
 }
 
+
+// This function is called when we have proved that a SETCC node can be replaced
+// by subtraction (and other supporting instructions) so that the result of
+// comparison is kept in a GPR instead of CR. This function is purely for
+// codegen purposes and has some flags to guide the codegen process:
+//   Size       - bit width of the integer type the operands are widened to.
+//   Complement - XOR the extracted sign bit with 1 (negates the comparison).
+//   Swap       - exchange the two operands before subtracting.
+// Because both operands are zero extended from a strictly narrower type, the
+// difference is negative exactly when the first (post-swap) operand is
+// unsigned-less-than the second one.
+static SDValue GenerateEquivalentSub(SDNode *N, int Size, bool Complement,
+                                     bool Swap, SDLoc &DL, SelectionDAG &DAG) {
+
+  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
+
+  // Zero extend the operands to the largest legal integer. Originally, they
+  // must be of a strictly smaller size. ZERO_EXTEND is a unary node, so the
+  // destination width is expressed only through the result type.
+  const MVT WideVT = MVT::getIntegerVT(Size);
+  SDValue Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, N->getOperand(0));
+  SDValue Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, N->getOperand(1));
+
+  // Swap if needed. Depends on the condition code.
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  // Subtract extended integers.
+  SDValue SubNode = DAG.getNode(ISD::SUB, DL, WideVT, Op0, Op1);
+
+  // Move the sign bit to the least significant position and zero out the rest.
+  // Now the least significant bit carries the result of original comparison.
+  SDValue Shifted = DAG.getNode(ISD::SRL, DL, WideVT, SubNode,
+                                DAG.getConstant(Size - 1, DL, MVT::i32));
+  SDValue Final = Shifted;
+
+  // Complement the result if needed. Based on the condition code.
+  if (Complement)
+    Final = DAG.getNode(ISD::XOR, DL, WideVT, Shifted,
+                        DAG.getConstant(1, DL, WideVT));
+
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final);
+}
+
+SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+
+  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  // Size of integers being compared has a critical role in the following
+  // analysis, so we prefer to do this when all types are legal.
+  if (!DCI.isAfterLegalizeVectorOps())
+    return SDValue();
+
+  // If all users of SETCC extend its value to a legal integer type
+  // then we replace SETCC with a subtraction
+  for (SDNode::use_iterator UI = N->use_begin(),
+       UE = N->use_end(); UI != UE; ++UI) {
+    if (UI->getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+  }
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  auto OpSize = N->getOperand(0).getValueSizeInBits();
+
+  unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();
+
+  if (OpSize < Size) {
+    // The boolean arguments below are (Complement, Swap):
+    //   a <u  b ==  sign(a - b)      a >u  b ==  sign(b - a)
+    //   a <=u b == !sign(b - a)      a >=u b == !sign(a - b)
+    switch (CC) {
+    default: break;
+    case ISD::SETULT:
+      return GenerateEquivalentSub(N, Size, false, false, DL, DAG);
+    case ISD::SETULE:
+      return GenerateEquivalentSub(N, Size, true, true, DL, DAG);
+    case ISD::SETUGT:
+      return GenerateEquivalentSub(N, Size, false, true, DL, DAG);
+    case ISD::SETUGE:
+      return GenerateEquivalentSub(N, Size, true, false, DL, DAG);
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -9958,7 +10048,8 @@
                                  APInt::getHighBitsSet(OpBits, OpBits-1)) ||
           !DAG.MaskedValueIsZero(N->getOperand(1),
                                  APInt::getHighBitsSet(OpBits, OpBits-1)))
-        return SDValue();
+        return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
+                                             : SDValue());
     } else {
       // This is neither a signed nor an unsigned comparison, just make sure
       // that the high bits are equal.
Index: pzero-fp-xored.ll
===================================================================
--- pzero-fp-xored.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; RUN: llc -mtriple=powerpc-unknown-linux-gnu -mattr=+vsx < %s | \
-; RUN: FileCheck %s --implicit-check-not lxvd2x --implicit-check-not lfs
-; RUN: llc -mtriple=powerpc-unknown-linux-gnu -mattr=-vsx -mattr=-p8altivec < %s | \
-; RUN: FileCheck %s --check-prefix=CHECK-NVSXP8A --implicit-check-not xxlxor \
-; RUN: --implicit-check-not vxor
-
-define signext i32 @t1(float %x) local_unnamed_addr #0 {
-entry:
-  %cmp = fcmp ogt float %x, 0.000000e+00
-  %tmp = select i1 %cmp, i32 43, i32 11
-  ret i32 %tmp
-
-; CHECK-LABEL: t1:
-; CHECK: xxlxor [[REG1:[0-9]+]], [[REG1]], [[REG1]]
-; CHECK: fcmpu {{[0-9]+}}, {{[0-9]+}}, [[REG1]]
-; CHECK: blr
-; CHECK-NVSXP8A: lfs [[REG1:[0-9]+]]
-; CHECK-NVSXP8A: fcmpu {{[0-9]+}}, {{[0-9]+}}, [[REG1]]
-; CHECK-NVSXP8A: blr
-}
-
-define signext i32 @t2(double %x) local_unnamed_addr #0 {
-entry:
-  %cmp = fcmp ogt double %x, 0.000000e+00
-  %tmp = select i1 %cmp, i32 43, i32 11
-  ret i32 %tmp
-
-; CHECK-LABEL: t2:
-; CHECK: xxlxor [[REG2:[0-9]+]], [[REG2]], [[REG2]]
-; CHECK: xscmpudp {{[0-9]+}}, {{[0-9]+}}, [[REG2]]
-; CHECK: blr
-; CHECK-NVSXP8A: lfs [[REG2:[0-9]+]]
-; CHECK-NVSXP8A: fcmpu {{[0-9]+}}, {{[0-9]+}}, [[REG2]]
-; CHECK-NVSXP8A: blr
-}
-
-define signext i32 @t3(ppc_fp128 %x) local_unnamed_addr #0 {
-entry:
-  %cmp = fcmp ogt ppc_fp128 %x, 0xM00000000000000000000000000000000
-  %tmp = select i1 %cmp, i32 43, i32 11
-  ret i32 %tmp
-
-; CHECK-LABEL: t3:
-; CHECK: xxlxor [[REG3:[0-9]+]], [[REG3]], [[REG3]]
-; CHECK: fcmpu {{[0-9]+}}, {{[0-9]+}}, [[REG3]]
-; CHECK: fcmpu {{[0-9]+}}, {{[0-9]+}}, [[REG3]]
-; CHECK: blr
-; CHECK-NVSXP8A: lfs [[REG3:[0-9]+]]
-; CHECK-NVSXP8A: fcmpu {{[0-9]+}}, {{[0-9]+}}, [[REG3]]
-; CHECK-NVSXP8A: blr
-}
-
-define <2 x double> @t4() local_unnamed_addr #0 {
-  ret <2 x double> zeroinitializer
-; CHECK-LABEL: t4:
-; CHECK: vxor [[REG4:[0-9]+]], [[REG4]], [[REG4]]
-; CHECK: blr
-; CHECK-NVSXP8A: lfs [[REG4:[0-9]+]]
-; CHECK-NVSXP8A: fmr {{[0-9]+}}, [[REG4:[0-9]+]]
-; CHECK-NVSXP8A: blr
-}
-
-define <2 x i64> @t5() local_unnamed_addr #0 {
-  ret <2 x i64> zeroinitializer
-; CHECK-LABEL: t5:
-; CHECK: vxor [[REG5:[0-9]+]], [[REG5]], [[REG5]]
-; CHECK: blr
-; CHECK-NVSXP8A: lvx
-; CHECK-NVSXP8A: blr
-}
-
Index: test/CodeGen/PowerPC/setcc-to-sub.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/setcc-to-sub.ll
@@ -0,0 +1,96 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 < %s | FileCheck %s
+
+%class.PB2 = type { [1 x i32], %class.PB1* }
+%class.PB1 = type { [1 x i32], i64, i64, i32 }
+
+; Function Attrs: norecurse nounwind readonly
+define zeroext i1 @test1(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+entry:
+  %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
+  %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
+  %and.i = and i32 %0, 8
+  %arrayidx.i37 = bitcast %class.PB2* %s_b to i32*
+  %1 = load i32, i32* %arrayidx.i37, align 8, !tbaa !1
+  %and.i4 = and i32 %1, 8
+  %cmp.i5 = icmp ult i32 %and.i, %and.i4
+  ret i1 %cmp.i5
+
+; CHECK-LABEL: @test1
+; CHECK: rlwinm [[REG1:[0-9]*]]
+; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
+; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG1]], [[REG2]]
+; CHECK-NEXT: rldicl 3, [[REG3]]
+; CHECK: blr
+
+}
+
+; Function Attrs: norecurse nounwind readonly
+define zeroext i1 @test2(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+entry:
+  %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
+  %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
+  %and.i = and i32 %0, 8
+  %arrayidx.i37 = bitcast %class.PB2* %s_b to i32*
+  %1 = load i32, i32* %arrayidx.i37, align 8, !tbaa !1
+  %and.i4 = and i32 %1, 8
+  %cmp.i5 = icmp ule i32 %and.i, %and.i4
+  ret i1 %cmp.i5
+
+; CHECK-LABEL: @test2
+; CHECK: rlwinm [[REG1:[0-9]*]]
+; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
+; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG2]], [[REG1]]
+; CHECK-NEXT: rldicl [[REG4:[0-9]*]], [[REG3]]
+; CHECK-NEXT: xori 3, [[REG4]], 1
+; CHECK: blr
+
+}
+
+; Function Attrs: norecurse nounwind readonly
+define zeroext i1 @test3(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+entry:
+  %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
+  %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
+  %and.i = and i32 %0, 8
+  %arrayidx.i37 = bitcast %class.PB2* %s_b to i32*
+  %1 = load i32, i32* %arrayidx.i37, align 8, !tbaa !1
+  %and.i4 = and i32 %1, 8
+  %cmp.i5 = icmp ugt i32 %and.i, %and.i4
+  ret i1 %cmp.i5
+
+; CHECK-LABEL: @test3
+; CHECK: rlwinm [[REG1:[0-9]*]]
+; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
+; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG2]], [[REG1]]
+; CHECK-NEXT: rldicl 3, [[REG3]]
+; CHECK: blr
+
+}
+
+; Function Attrs: norecurse nounwind readonly
+define zeroext i1 @test4(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+entry:
+  %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
+  %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
+  %and.i = and i32 %0, 8
+  %arrayidx.i37 = bitcast %class.PB2* %s_b to i32*
+  %1 = load i32, i32* %arrayidx.i37, align 8, !tbaa !1
+  %and.i4 = and i32 %1, 8
+  %cmp.i5 = icmp uge i32 %and.i, %and.i4
+  ret i1 %cmp.i5
+
+; CHECK-LABEL: @test4
+; CHECK: rlwinm [[REG1:[0-9]*]]
+; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
+; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG1]], [[REG2]]
+; CHECK-NEXT: rldicl [[REG4:[0-9]*]], [[REG3]]
+; CHECK-NEXT: xori 3, [[REG4]], 1
+; CHECK: blr
+
+}
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}