diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6283,12 +6283,6 @@
     SDValue Zero = DAG.getConstant(0, sdl, VT);
     SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC);
 
-    auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
-    if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) {
-      setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
-      return;
-    }
-
     // When X == Y, this is rotate. If the data type has a power-of-2 size, we
     // avoid the select that is necessary in the general case to filter out
     // the 0-shift possibility that leads to UB.
@@ -6318,6 +6312,12 @@
       return;
     }
 
+    auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
+    if (TLI.isOperationLegalOrCustom(FunnelOpcode, VT)) {
+      setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
+      return;
+    }
+
     // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
     // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
     SDValue InvShAmt = DAG.getNode(ISD::SUB, sdl, VT, BitWidthC, ShAmt);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1117,6 +1117,7 @@
     SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -613,6 +613,15 @@
     setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
   }
 
+  // PowerPC has better expansions for funnel shifts than the generic
+  // TargetLowering::expandFunnelShift.
+  if (Subtarget.has64BitSupport()) {
+    setOperationAction(ISD::FSHL, MVT::i64, Custom);
+    setOperationAction(ISD::FSHR, MVT::i64, Custom);
+  }
+  setOperationAction(ISD::FSHL, MVT::i32, Custom);
+  setOperationAction(ISD::FSHR, MVT::i32, Custom);
+
   if (Subtarget.hasVSX()) {
     setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
     setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
@@ -8972,6 +8981,31 @@
   return DAG.getMergeValues(OutOps, dl);
 }
 
+SDValue PPCTargetLowering::LowerFunnelShift(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+
+  bool IsFSHL = Op.getOpcode() == ISD::FSHL;
+  SDValue X = Op.getOperand(0);
+  SDValue Y = Op.getOperand(1);
+  SDValue Z = Op.getOperand(2);
+  EVT AmtVT = Z.getValueType();
+
+  // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+  // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+  // This is simpler than TargetLowering::expandFunnelShift because we can rely
+  // on PowerPC shift by BW being well defined.
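+  // For example, when (Z % BW) == 0, SubZ below is BW; the shift by BW then
+  // produces 0, so the OR yields X for fshl and Y for fshr, with no select
+  // needed to guard the zero-shift case.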
+  Z = DAG.getNode(ISD::AND, dl, AmtVT, Z,
+                  DAG.getConstant(BitWidth - 1, dl, AmtVT));
+  SDValue SubZ =
+      DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Z);
+  X = DAG.getNode(PPCISD::SHL, dl, VT, X, IsFSHL ? Z : SubZ);
+  Y = DAG.getNode(PPCISD::SRL, dl, VT, Y, IsFSHL ? SubZ : Z);
+  return DAG.getNode(ISD::OR, dl, VT, X, Y);
+}
+
 //===----------------------------------------------------------------------===//
 // Vector related lowering.
 //
@@ -11169,6 +11203,9 @@
   case ISD::SRL_PARTS:          return LowerSRL_PARTS(Op, DAG);
   case ISD::SRA_PARTS:          return LowerSRA_PARTS(Op, DAG);
 
+  case ISD::FSHL:               return LowerFunnelShift(Op, DAG);
+  case ISD::FSHR:               return LowerFunnelShift(Op, DAG);
+
   // Vector-related lowering.
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift.ll b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
--- a/llvm/test/CodeGen/PowerPC/funnel-shift.ll
+++ b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
@@ -18,17 +18,29 @@
 define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshl_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andi. 5, 5, 31
+; CHECK-NEXT:    clrlwi 5, 5, 27
 ; CHECK-NEXT:    subfic 6, 5, 32
-; CHECK-NEXT:    slw 5, 3, 5
+; CHECK-NEXT:    slw 3, 3, 5
 ; CHECK-NEXT:    srw 4, 4, 6
-; CHECK-NEXT:    or 4, 5, 4
-; CHECK-NEXT:    iseleq 3, 3, 4
+; CHECK-NEXT:    or 3, 3, 4
 ; CHECK-NEXT:    blr
   %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
   ret i32 %f
 }
 
+define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) {
+; CHECK-LABEL: fshl_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    clrlwi 5, 5, 26
+; CHECK-NEXT:    subfic 6, 5, 64
+; CHECK-NEXT:    sld 3, 3, 5
+; CHECK-NEXT:    srd 4, 4, 6
+; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    blr
+  %f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
+  ret i64 %f
+}
+
 ; Verify that weird types are minimally supported.
 declare i37 @llvm.fshl.i37(i37, i37, i37)
 define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
@@ -72,9 +84,9 @@
 define i32 @fshl_i32_const_shift(i32 %x, i32 %y) {
 ; CHECK-LABEL: fshl_i32_const_shift:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    rotlwi 4, 4, 9
-; CHECK-NEXT:    rlwimi 4, 3, 9, 0, 22
-; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    srwi 4, 4, 23
+; CHECK-NEXT:    slwi 3, 3, 9
+; CHECK-NEXT:    or 3, 3, 4
 ; CHECK-NEXT:    blr
   %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
   ret i32 %f
@@ -85,9 +97,9 @@
 define i32 @fshl_i32_const_overshift(i32 %x, i32 %y) {
 ; CHECK-LABEL: fshl_i32_const_overshift:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    rotlwi 4, 4, 9
-; CHECK-NEXT:    rlwimi 4, 3, 9, 0, 22
-; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    srwi 4, 4, 23
+; CHECK-NEXT:    slwi 3, 3, 9
+; CHECK-NEXT:    or 3, 3, 4
 ; CHECK-NEXT:    blr
   %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41)
   ret i32 %f
@@ -98,9 +110,9 @@
 define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) {
 ; CHECK-LABEL: fshl_i64_const_overshift:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    rotldi 4, 4, 41
-; CHECK-NEXT:    rldimi 4, 3, 41, 0
-; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    rldicl 4, 4, 41, 23
+; CHECK-NEXT:    sldi 3, 3, 41
+; CHECK-NEXT:    or 3, 3, 4
 ; CHECK-NEXT:    blr
   %f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105)
   ret i64 %f
@@ -124,17 +136,29 @@
 define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshr_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andi. 5, 5, 31
+; CHECK-NEXT:    clrlwi 5, 5, 27
 ; CHECK-NEXT:    subfic 6, 5, 32
-; CHECK-NEXT:    srw 5, 4, 5
+; CHECK-NEXT:    srw 4, 4, 5
 ; CHECK-NEXT:    slw 3, 3, 6
-; CHECK-NEXT:    or 3, 3, 5
-; CHECK-NEXT:    iseleq 3, 4, 3
+; CHECK-NEXT:    or 3, 3, 4
 ; CHECK-NEXT:    blr
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
   ret i32 %f
 }
 
+define i64 @fshr_i64(i64 %x, i64 %y, i64 %z) {
+; CHECK-LABEL: fshr_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    clrlwi 5, 5, 26
+; CHECK-NEXT:    subfic 6, 5, 64
+; CHECK-NEXT:    srd 4, 4, 5
+; CHECK-NEXT:    sld 3, 3, 6
+; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    blr
+  %f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z)
+  ret i64 %f
+}
+
 ; Verify that weird types are minimally supported.
 declare i37 @llvm.fshr.i37(i37, i37, i37)
 define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
@@ -178,9 +202,9 @@
 define i32 @fshr_i32_const_shift(i32 %x, i32 %y) {
 ; CHECK-LABEL: fshr_i32_const_shift:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    rotlwi 4, 4, 23
-; CHECK-NEXT:    rlwimi 4, 3, 23, 0, 8
-; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    srwi 4, 4, 9
+; CHECK-NEXT:    slwi 3, 3, 23
+; CHECK-NEXT:    or 3, 3, 4
 ; CHECK-NEXT:    blr
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
   ret i32 %f
@@ -191,9 +215,9 @@
 define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) {
 ; CHECK-LABEL: fshr_i32_const_overshift:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    rotlwi 4, 4, 23
-; CHECK-NEXT:    rlwimi 4, 3, 23, 0, 8
-; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    srwi 4, 4, 9
+; CHECK-NEXT:    slwi 3, 3, 23
+; CHECK-NEXT:    or 3, 3, 4
 ; CHECK-NEXT:    blr
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41)
   ret i32 %f
@@ -204,9 +228,9 @@
 define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) {
 ; CHECK-LABEL: fshr_i64_const_overshift:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    rotldi 4, 4, 23
-; CHECK-NEXT:    rldimi 4, 3, 23, 0
-; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    rldicl 4, 4, 23, 41
+; CHECK-NEXT:    sldi 3, 3, 23
+; CHECK-NEXT:    or 3, 3, 4
 ; CHECK-NEXT:    blr
   %f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
   ret i64 %f
diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-endian.ll b/llvm/test/CodeGen/PowerPC/ppcf128-endian.ll
--- a/llvm/test/CodeGen/PowerPC/ppcf128-endian.ll
+++ b/llvm/test/CodeGen/PowerPC/ppcf128-endian.ll
@@ -120,7 +120,7 @@
 ; CHECK-LABEL: convert_to2:
 ; CHECK: std 3, [[OFF1:.*]](1)
-; CHECK: std 5, [[OFF2:.*]](1)
+; CHECK: std 4, [[OFF2:.*]](1)
 ; CHECK: lfd 1, [[OFF1]](1)
 ; CHECK: lfd 2, [[OFF2]](1)
 ; CHECK: blr
diff --git a/llvm/test/CodeGen/PowerPC/pr44183.ll b/llvm/test/CodeGen/PowerPC/pr44183.ll
--- a/llvm/test/CodeGen/PowerPC/pr44183.ll
+++ b/llvm/test/CodeGen/PowerPC/pr44183.ll
@@ -8,14 +8,20 @@
 ; CHECK-LABEL: _ZN1m1nEv:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r0, 16(r1)
-; CHECK-NEXT:    stdu r1, -48(r1)
+; CHECK-NEXT:    stdu r1, -64(r1)
 ; CHECK-NEXT:    mr r30, r3
-; CHECK-NEXT:    ld r4, 8(r30)
+; CHECK-NEXT:    li r3, 4
+; CHECK-NEXT:    ld r4, 16(r30)
+; CHECK-NEXT:    ld r5, 8(r30)
+; CHECK-NEXT:    subfic r29, r3, 64
+; CHECK-NEXT:    rldicl r3, r5, 60, 4
+; CHECK-NEXT:    sld r4, r4, r29
 ; CHECK-NEXT:    lwz r5, 36(r30)
-; CHECK-NEXT:    rldicl r4, r4, 60, 4
-; CHECK-NEXT:    rlwinm r3, r4, 31, 0, 0
+; CHECK-NEXT:    or r3, r4, r3
+; CHECK-NEXT:    rlwinm r3, r3, 31, 0, 0
 ; CHECK-NEXT:    clrlwi r4, r5, 31
 ; CHECK-NEXT:    or r4, r4, r3
 ; CHECK-NEXT:    bl _ZN1llsE1d
@@ -23,15 +29,16 @@
 ; CHECK-NEXT:    ld r3, 16(r30)
 ; CHECK-NEXT:    ld r4, 8(r30)
 ; CHECK-NEXT:    rldicl r4, r4, 60, 4
-; CHECK-NEXT:    sldi r3, r3, 60
-; CHECK-NEXT:    or r3, r4, r3
+; CHECK-NEXT:    sld r3, r3, r29
+; CHECK-NEXT:    or r3, r3, r4
 ; CHECK-NEXT:    sldi r3, r3, 31
 ; CHECK-NEXT:    clrldi r4, r3, 32
 ; CHECK-NEXT:    bl _ZN1llsE1d
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    addi r1, r1, 48
+; CHECK-NEXT:    addi r1, r1, 64
 ; CHECK-NEXT:    ld r0, 16(r1)
 ; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    mtlr r0
 ; CHECK-NEXT:    blr
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/smulfixsat.ll b/llvm/test/CodeGen/PowerPC/smulfixsat.ll
--- a/llvm/test/CodeGen/PowerPC/smulfixsat.ll
+++ b/llvm/test/CodeGen/PowerPC/smulfixsat.ll
@@ -6,21 +6,23 @@
 define i32 @func1(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: func1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    mullw 6, 3, 4
 ; CHECK-NEXT:    lis 5, 32767
-; CHECK-NEXT:    mulhw. 6, 3, 4
-; CHECK-NEXT:    lis 7, -32768
-; CHECK-NEXT:    mullw 3, 3, 4
+; CHECK-NEXT:    mulhw. 3, 3, 4
+; CHECK-NEXT:    srawi 4, 6, 31
+; CHECK-NEXT:    cmplw 1, 3, 4
+; CHECK-NEXT:    lis 3, -32768
 ; CHECK-NEXT:    ori 4, 5, 65535
-; CHECK-NEXT:    srawi 5, 3, 31
-; CHECK-NEXT:    cmplw 1, 6, 5
-; CHECK-NEXT:    bc 12, 0, .LBB0_1
+; CHECK-NEXT:    ori 5, 6, 0
+; CHECK-NEXT:    bc 12, 0, .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    ori 3, 4, 0
 ; CHECK-NEXT:    b .LBB0_2
-; CHECK-NEXT:  .LBB0_1:
-; CHECK-NEXT:    addi 4, 7, 0
 ; CHECK-NEXT:  .LBB0_2:
-; CHECK-NEXT:    bclr 12, 6, 0
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    ori 3, 4, 0
+; CHECK-NEXT:    bc 12, 6, .LBB0_3
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:    addi 3, 5, 0
 ; CHECK-NEXT:    blr
   %tmp = call i32 @llvm.smul.fix.sat.i32(i32 %x, i32 %y, i32 0)
   ret i32 %tmp
@@ -29,23 +31,23 @@
 define i32 @func2(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: func2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulhw. 6, 3, 4
+; CHECK-NEXT:    mullw 6, 3, 4
 ; CHECK-NEXT:    lis 5, 32767
-; CHECK-NEXT:    mullw 3, 3, 4
-; CHECK-NEXT:    rotlwi 3, 3, 31
-; CHECK-NEXT:    ori 4, 5, 65535
-; CHECK-NEXT:    rlwimi 3, 6, 31, 0, 0
+; CHECK-NEXT:    ori 5, 5, 65535
+; CHECK-NEXT:    mulhw. 3, 3, 4
+; CHECK-NEXT:    srwi 4, 6, 1
+; CHECK-NEXT:    slwi 6, 3, 31
+; CHECK-NEXT:    or 4, 6, 4
 ; CHECK-NEXT:    bc 12, 1, .LBB1_1
 ; CHECK-NEXT:    b .LBB1_2
 ; CHECK-NEXT:  .LBB1_1:
-; CHECK-NEXT:    addi 3, 4, 0
+; CHECK-NEXT:    addi 4, 5, 0
 ; CHECK-NEXT:  .LBB1_2:
-; CHECK-NEXT:    cmpwi 6, -1
-; CHECK-NEXT:    lis 4, -32768
-; CHECK-NEXT:    bc 12, 0, .LBB1_3
-; CHECK-NEXT:    blr
-; CHECK-NEXT:  .LBB1_3:
-; CHECK-NEXT:    addi 3, 4, 0
+; CHECK-NEXT:    cmpwi 3, -1
+; CHECK-NEXT:    lis 3, -32768
+; CHECK-NEXT:    bclr 12, 0, 0
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    ori 3, 4, 0
 ; CHECK-NEXT:    blr
   %tmp = call i32 @llvm.smul.fix.sat.i32(i32 %x, i32 %y, i32 1)
   ret i32 %tmp
diff --git a/llvm/test/CodeGen/PowerPC/umulfixsat.ll b/llvm/test/CodeGen/PowerPC/umulfixsat.ll
--- a/llvm/test/CodeGen/PowerPC/umulfixsat.ll
+++ b/llvm/test/CodeGen/PowerPC/umulfixsat.ll
@@ -6,9 +6,10 @@
 define i32 @func1(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: func1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li 5, -1
 ; CHECK-NEXT:    mulhwu. 6, 3, 4
+; CHECK-NEXT:    li 5, -1
 ; CHECK-NEXT:    mullw 3, 3, 4
+; CHECK-NEXT:    ori 3, 3, 0
 ; CHECK-NEXT:    bclr 12, 2, 0
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    ori 3, 5, 0
@@ -20,15 +21,18 @@
 define i32 @func2(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: func2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    mulhwu 6, 3, 4
+; CHECK-NEXT:    mullw 6, 3, 4
 ; CHECK-NEXT:    li 5, -1
-; CHECK-NEXT:    cmplwi 6, 1
-; CHECK-NEXT:    mullw 3, 3, 4
-; CHECK-NEXT:    rotlwi 3, 3, 31
-; CHECK-NEXT:    rlwimi 3, 6, 31, 0, 0
-; CHECK-NEXT:    bc 12, 1, .LBB1_1
+; CHECK-NEXT:    mulhwu 3, 3, 4
+; CHECK-NEXT:    srwi 4, 6, 1
+; CHECK-NEXT:    slwi 6, 3, 31
+; CHECK-NEXT:    or 4, 6, 4
+; CHECK-NEXT:    cmplwi 3, 1
+; CHECK-NEXT:    bc 12, 1, .LBB1_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    ori 3, 4, 0
 ; CHECK-NEXT:    blr
-; CHECK-NEXT:  .LBB1_1:
+; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    addi 3, 5, 0
 ; CHECK-NEXT:    blr
   %tmp = call i32 @llvm.umul.fix.sat.i32(i32 %x, i32 %y, i32 1)