diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6254,12 +6254,6 @@ SDValue Zero = DAG.getConstant(0, sdl, VT); SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC); - auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR; - if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) { - setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z)); - return; - } - // When X == Y, this is rotate. If the data type has a power-of-2 size, we // avoid the select that is necessary in the general case to filter out // the 0-shift possibility that leads to UB. @@ -6289,6 +6283,12 @@ return; } + auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR; + if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) { + setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z)); + return; + } + // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) SDValue InvShAmt = DAG.getNode(ISD::SUB, sdl, VT, BitWidthC, ShAmt); diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -1253,6 +1253,7 @@ } break; case ISD::SHL: + case PPCISD::SHL: if (isa<ConstantSDNode>(V.getOperand(1))) { unsigned ShiftAmt = V.getConstantOperandVal(1); @@ -1268,6 +1269,7 @@ } break; case ISD::SRL: + case PPCISD::SRL: if (isa<ConstantSDNode>(V.getOperand(1))) { unsigned ShiftAmt = V.getConstantOperandVal(1); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1092,6 +1092,7 @@ SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSRA_PARTS(SDValue 
Op, SelectionDAG &DAG) const; + SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -617,6 +617,15 @@ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } + // PowerPC has better expansions for funnel shifts than the generic + // TargetLowering::expandFunnelShift. + if (Subtarget.has64BitSupport()) { + setOperationAction(ISD::FSHL, MVT::i64, Custom); + setOperationAction(ISD::FSHR, MVT::i64, Custom); + } + setOperationAction(ISD::FSHL, MVT::i32, Custom); + setOperationAction(ISD::FSHR, MVT::i32, Custom); + if (Subtarget.hasVSX()) { setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); @@ -8626,6 +8635,31 @@ return DAG.getMergeValues(OutOps, dl); } +SDValue PPCTargetLowering::LowerFunnelShift(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + + bool IsFSHL = Op.getOpcode() == ISD::FSHL; + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + SDValue Z = Op.getOperand(2); + EVT AmtVT = Z.getValueType(); + + // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) + // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + // This is simpler than TargetLowering::expandFunnelShift because we can rely + // on PowerPC shift by BW being well defined. + Z = DAG.getNode(ISD::AND, dl, AmtVT, Z, + DAG.getConstant(BitWidth - 1, dl, AmtVT)); + SDValue SubZ = + DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Z); + X = DAG.getNode(PPCISD::SHL, dl, VT, X, IsFSHL ? 
Z : SubZ); + Y = DAG.getNode(PPCISD::SRL, dl, VT, Y, IsFSHL ? SubZ : Z); + return DAG.getNode(ISD::OR, dl, VT, X, Y); +} + //===----------------------------------------------------------------------===// // Vector related lowering. // @@ -10421,6 +10455,9 @@ case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG); case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG); + case ISD::FSHL: return LowerFunnelShift(Op, DAG); + case ISD::FSHR: return LowerFunnelShift(Op, DAG); + // Vector-related lowering. case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift.ll b/llvm/test/CodeGen/PowerPC/funnel-shift.ll --- a/llvm/test/CodeGen/PowerPC/funnel-shift.ll +++ b/llvm/test/CodeGen/PowerPC/funnel-shift.ll @@ -18,12 +18,11 @@ define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: fshl_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: andi. 5, 5, 31 +; CHECK-NEXT: clrlwi 5, 5, 27 ; CHECK-NEXT: subfic 6, 5, 32 -; CHECK-NEXT: slw 5, 3, 5 +; CHECK-NEXT: slw 3, 3, 5 ; CHECK-NEXT: srw 4, 4, 6 -; CHECK-NEXT: or 4, 5, 4 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: or 3, 3, 4 ; CHECK-NEXT: blr %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) ret i32 %f @@ -32,12 +31,11 @@ define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) { ; CHECK-LABEL: fshl_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: andi. 5, 5, 63 +; CHECK-NEXT: clrlwi 5, 5, 26 ; CHECK-NEXT: subfic 6, 5, 64 -; CHECK-NEXT: sld 5, 3, 5 +; CHECK-NEXT: sld 3, 3, 5 ; CHECK-NEXT: srd 4, 4, 6 -; CHECK-NEXT: or 4, 5, 4 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: or 3, 3, 4 ; CHECK-NEXT: blr %f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z) ret i64 %f @@ -138,12 +136,11 @@ define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: fshr_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: andi. 
5, 5, 31 +; CHECK-NEXT: clrlwi 5, 5, 27 ; CHECK-NEXT: subfic 6, 5, 32 -; CHECK-NEXT: srw 5, 4, 5 +; CHECK-NEXT: srw 4, 4, 5 ; CHECK-NEXT: slw 3, 3, 6 -; CHECK-NEXT: or 3, 3, 5 -; CHECK-NEXT: iseleq 3, 4, 3 +; CHECK-NEXT: or 3, 3, 4 ; CHECK-NEXT: blr %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) ret i32 %f @@ -152,12 +149,11 @@ define i64 @fshr_i64(i64 %x, i64 %y, i64 %z) { ; CHECK-LABEL: fshr_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: andi. 5, 5, 63 +; CHECK-NEXT: clrlwi 5, 5, 26 ; CHECK-NEXT: subfic 6, 5, 64 -; CHECK-NEXT: srd 5, 4, 5 +; CHECK-NEXT: srd 4, 4, 5 ; CHECK-NEXT: sld 3, 3, 6 -; CHECK-NEXT: or 3, 3, 5 -; CHECK-NEXT: iseleq 3, 4, 3 +; CHECK-NEXT: or 3, 3, 4 ; CHECK-NEXT: blr %f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z) ret i64 %f diff --git a/llvm/test/CodeGen/PowerPC/pr44183.ll b/llvm/test/CodeGen/PowerPC/pr44183.ll --- a/llvm/test/CodeGen/PowerPC/pr44183.ll +++ b/llvm/test/CodeGen/PowerPC/pr44183.ll @@ -8,14 +8,20 @@ ; CHECK-LABEL: _ZN1m1nEv: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mflr r0 +; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r0, 16(r1) -; CHECK-NEXT: stdu r1, -48(r1) +; CHECK-NEXT: stdu r1, -64(r1) ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: ld r4, 8(r30) +; CHECK-NEXT: li r3, 4 +; CHECK-NEXT: ld r4, 16(r30) +; CHECK-NEXT: ld r5, 8(r30) +; CHECK-NEXT: subfic r29, r3, 64 +; CHECK-NEXT: rldicl r3, r5, 60, 4 +; CHECK-NEXT: sld r4, r4, r29 ; CHECK-NEXT: lwz r5, 36(r30) -; CHECK-NEXT: rldicl r4, r4, 60, 4 -; CHECK-NEXT: rlwinm r3, r4, 31, 0, 0 +; CHECK-NEXT: or r3, r4, r3 +; CHECK-NEXT: rlwinm r3, r3, 31, 0, 0 ; CHECK-NEXT: clrlwi r4, r5, 31 ; CHECK-NEXT: or r4, r4, r3 ; CHECK-NEXT: bl _ZN1llsE1d @@ -23,15 +29,16 @@ ; CHECK-NEXT: ld r3, 16(r30) ; CHECK-NEXT: ld r4, 8(r30) ; CHECK-NEXT: rldicl r4, r4, 60, 4 -; CHECK-NEXT: sldi r3, r3, 60 -; CHECK-NEXT: or r3, r4, r3 +; CHECK-NEXT: sld r3, r3, r29 +; CHECK-NEXT: or r3, r3, r4 ; CHECK-NEXT: sldi r3, r3, 31 ; 
CHECK-NEXT: clrldi r4, r3, 32 ; CHECK-NEXT: bl _ZN1llsE1d ; CHECK-NEXT: nop -; CHECK-NEXT: addi r1, r1, 48 +; CHECK-NEXT: addi r1, r1, 64 ; CHECK-NEXT: ld r0, 16(r1) ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload ; CHECK-NEXT: mtlr r0 ; CHECK-NEXT: blr entry: