Index: lib/Analysis/InstructionSimplify.cpp
===================================================================
--- lib/Analysis/InstructionSimplify.cpp
+++ lib/Analysis/InstructionSimplify.cpp
@@ -5037,7 +5037,16 @@
   }
   case Intrinsic::fshl:
   case Intrinsic::fshr: {
-    Value *ShAmtArg = ArgBegin[2];
+    Value *Op0 = ArgBegin[0], *Op1 = ArgBegin[1], *ShAmtArg = ArgBegin[2];
+
+    // If both operands are undef, the result is undef.
+    if (match(Op0, m_Undef()) && match(Op1, m_Undef()))
+      return UndefValue::get(F->getReturnType());
+
+    // If shift amount is undef, assume it is zero.
+    if (match(ShAmtArg, m_Undef()))
+      return ArgBegin[IID == Intrinsic::fshl ? 0 : 1];
+
     const APInt *ShAmtC;
     if (match(ShAmtArg, m_APInt(ShAmtC))) {
       // If there's effectively no shift, return the 1st arg or 2nd arg.
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1992,6 +1992,29 @@
 
   case Intrinsic::fshl:
   case Intrinsic::fshr: {
+    const APInt *SA;
+    if (match(II->getArgOperand(2), m_APInt(SA))) {
+      unsigned BitWidth = SA->getBitWidth();
+      uint64_t ShiftAmt = SA->urem(BitWidth);
+      // Zero shift is already handled in simplification.
+      if (ShiftAmt != 0) {
+        Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1);
+        // Normalize to funnel shift left.
+        if (II->getIntrinsicID() == Intrinsic::fshr)
+          ShiftAmt = BitWidth - ShiftAmt;
+
+        // fshl(X, undef, C) -> shl X, C
+        if (match(Op1, m_Undef()))
+          return replaceInstUsesWith(*II, Builder.CreateShl(Op0, ShiftAmt));
+
+        // fshl(undef, X, C) -> lshr X, (BW-C)
+        if (match(Op0, m_Undef())) {
+          return replaceInstUsesWith(*II,
+                                     Builder.CreateLShr(Op1, BitWidth - ShiftAmt));
+        }
+      }
+    }
+
     // The shift amount (operand 2) of a funnel shift is modulo the bitwidth,
     // so only the low bits of the shift amount are demanded if the bitwidth is
     // a power-of-2.
Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -690,6 +690,35 @@
       // TODO: Could compute known zero/one bits based on the input.
       break;
     }
+    case Intrinsic::fshr:
+    case Intrinsic::fshl: {
+      const APInt *SA;
+      if (!match(I->getOperand(2), m_APInt(SA)))
+        break;
+
+      // Zero shifts are handled during simplification.
+      uint64_t ShiftAmt = SA->urem(BitWidth);
+      if (ShiftAmt == 0)
+        break;
+
+      // Normalize to funnel shift left.
+      if (II->getIntrinsicID() == Intrinsic::fshr)
+        ShiftAmt = BitWidth - ShiftAmt;
+
+      APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt));
+      APInt DemandedMaskRHS(
+          DemandedMask.getLoBits(ShiftAmt).shl(BitWidth - ShiftAmt));
+
+      if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
+          SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1))
+        return I;
+
+      Known.Zero = LHSKnown.Zero.shl(ShiftAmt) |
+                   RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
+      Known.One = LHSKnown.One.shl(ShiftAmt) |
+                  RHSKnown.One.lshr(BitWidth - ShiftAmt);
+      break;
+    }
     case Intrinsic::x86_mmx_pmovmskb:
     case Intrinsic::x86_sse_movmsk_ps:
     case Intrinsic::x86_sse2_movmsk_pd:
Index: test/Transforms/InstCombine/fsh.ll
===================================================================
--- test/Transforms/InstCombine/fsh.ll
+++ test/Transforms/InstCombine/fsh.ll
@@ -141,3 +141,171 @@
   ret <2 x i31> %r
 }
+
+; Simplify one undef operand and constant shift amount.
+
+define i32 @fshl_op0_undef(i32 %x) {
+; CHECK-LABEL: @fshl_op0_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[X:%.*]], 25
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %r = call i32 @llvm.fshl.i32(i32 undef, i32 %x, i32 7)
+  ret i32 %r
+}
+
+define i33 @fshr_op0_undef(i33 %x) {
+; CHECK-LABEL: @fshr_op0_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i33 [[X:%.*]], 7
+; CHECK-NEXT:    ret i33 [[TMP1]]
+;
+  %r = call i33 @llvm.fshr.i33(i33 undef, i33 %x, i33 7)
+  ret i33 %r
+}
+
+define i32 @fshl_op1_undef(i32 %x) {
+; CHECK-LABEL: @fshl_op1_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X:%.*]], 7
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %r = call i32 @llvm.fshl.i32(i32 %x, i32 undef, i32 7)
+  ret i32 %r
+}
+
+define i33 @fshr_op1_undef(i33 %x) {
+; CHECK-LABEL: @fshr_op1_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i33 [[X:%.*]], 26
+; CHECK-NEXT:    ret i33 [[TMP1]]
+;
+  %r = call i33 @llvm.fshr.i33(i33 %x, i33 undef, i33 7)
+  ret i33 %r
+}
+
+; Only demand bits from one of the operands.
+
+define i32 @fshl_only_op0_demanded(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshl_only_op0_demanded(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = and i32 [[TMP1]], 128
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %z = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
+  %r = and i32 %z, 128
+  ret i32 %r
+}
+
+define i32 @fshl_only_op1_demanded(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshl_only_op1_demanded(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[Y:%.*]], 25
+; CHECK-NEXT:    [[R:%.*]] = and i32 [[TMP1]], 63
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %z = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
+  %r = and i32 %z, 63
+  ret i32 %r
+}
+
+define i33 @fshr_only_op1_demanded(i33 %x, i33 %y) {
+; CHECK-LABEL: @fshr_only_op1_demanded(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i33 [[Y:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = and i33 [[TMP1]], 12392
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %z = call i33 @llvm.fshr.i33(i33 %x, i33 %y, i33 7)
+  %r = and i33 %z, 12392
+  ret i33 %r
+}
+
+define i33 @fshr_only_op0_demanded(i33 %x, i33 %y) {
+; CHECK-LABEL: @fshr_only_op0_demanded(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i33 [[X:%.*]], 4
+; CHECK-NEXT:    [[R:%.*]] = and i33 [[TMP1]], 7
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %z = call i33 @llvm.fshr.i33(i33 %x, i33 %y, i33 7)
+  %r = lshr i33 %z, 30
+  ret i33 %r
+}
+
+; Demand bits from both operands -- cannot simplify.
+
+define i32 @fshl_both_ops_demanded(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshl_both_ops_demanded(
+; CHECK-NEXT:    [[Z:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 7)
+; CHECK-NEXT:    [[R:%.*]] = and i32 [[Z]], 192
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %z = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
+  %r = and i32 %z, 192
+  ret i32 %r
+}
+
+define i33 @fshr_both_ops_demanded(i33 %x, i33 %y) {
+; CHECK-LABEL: @fshr_both_ops_demanded(
+; CHECK-NEXT:    [[Z:%.*]] = call i33 @llvm.fshr.i33(i33 [[X:%.*]], i33 [[Y:%.*]], i33 26)
+; CHECK-NEXT:    [[R:%.*]] = and i33 [[Z]], 192
+; CHECK-NEXT:    ret i33 [[R]]
+;
+  %z = call i33 @llvm.fshr.i33(i33 %x, i33 %y, i33 26)
+  %r = and i33 %z, 192
+  ret i33 %r
+}
+
+; Both operands are demanded, but there are known bits.
+
+define i32 @fshl_known_bits(i32 %x, i32 %y) {
+; CHECK-LABEL: @fshl_known_bits(
+; CHECK-NEXT:    ret i32 128
+;
+  %x2 = or i32 %x, 1    ; lo bit set
+  %y2 = lshr i32 %y, 1  ; hi bit clear
+  %z = call i32 @llvm.fshl.i32(i32 %x2, i32 %y2, i32 7)
+  %r = and i32 %z, 192
+  ret i32 %r
+}
+
+define i33 @fshr_known_bits(i33 %x, i33 %y) {
+; CHECK-LABEL: @fshr_known_bits(
+; CHECK-NEXT:    ret i33 128
+;
+  %x2 = or i33 %x, 1    ; lo bit set
+  %y2 = lshr i33 %y, 1  ; hi bit clear
+  %z = call i33 @llvm.fshr.i33(i33 %x2, i33 %y2, i33 26)
+  %r = and i33 %z, 192
+  ret i33 %r
+}
+
+; This case fails to simplify due to multiple uses.
+
+define i33 @fshr_multi_use(i33 %a) {
+; CHECK-LABEL: @fshr_multi_use(
+; CHECK-NEXT:    [[B:%.*]] = tail call i33 @llvm.fshr.i33(i33 [[A:%.*]], i33 [[A]], i33 1)
+; CHECK-NEXT:    [[C:%.*]] = lshr i33 [[B]], 23
+; CHECK-NEXT:    [[D:%.*]] = xor i33 [[C]], [[B]]
+; CHECK-NEXT:    [[E:%.*]] = and i33 [[D]], 31
+; CHECK-NEXT:    ret i33 [[E]]
+;
+  %b = tail call i33 @llvm.fshr.i33(i33 %a, i33 %a, i33 1)
+  %c = lshr i33 %b, 23
+  %d = xor i33 %c, %b
+  %e = and i33 %d, 31
+  ret i33 %e
+}
+
+; This demonstrates the same simplification working when the fshr intrinsic
+; is expanded into shifts and an 'or'.
+
+define i33 @expanded_fshr_multi_use(i33 %a) {
+; CHECK-LABEL: @expanded_fshr_multi_use(
+; CHECK-NEXT:    [[TMP:%.*]] = lshr i33 [[A:%.*]], 1
+; CHECK-NEXT:    [[C:%.*]] = lshr i33 [[A]], 24
+; CHECK-NEXT:    [[D:%.*]] = xor i33 [[C]], [[TMP]]
+; CHECK-NEXT:    [[E:%.*]] = and i33 [[D]], 31
+; CHECK-NEXT:    ret i33 [[E]]
;
+  %tmp = lshr i33 %a, 1
+  %tmp2 = shl i33 %a, 32
+  %b = or i33 %tmp, %tmp2
+  %c = lshr i33 %b, 23
+  %d = xor i33 %c, %b
+  %e = and i33 %d, 31
+  ret i33 %e
+}
Index: test/Transforms/InstSimplify/call.ll
===================================================================
--- test/Transforms/InstSimplify/call.ll
+++ test/Transforms/InstSimplify/call.ll
@@ -628,3 +628,38 @@
   ret <2 x i8> %s
 }
+
+; If the first two operands of a funnel shift are undef, the result is undef.
+
+define i8 @fshl_ops_undef(i8 %shamt) {
+; CHECK-LABEL: @fshl_ops_undef(
+; CHECK-NEXT:    ret i8 undef
+;
+  %r = call i8 @llvm.fshl.i8(i8 undef, i8 undef, i8 %shamt)
+  ret i8 %r
+}
+
+define i9 @fshr_ops_undef(i9 %shamt) {
+; CHECK-LABEL: @fshr_ops_undef(
+; CHECK-NEXT:    ret i9 undef
+;
+  %r = call i9 @llvm.fshr.i9(i9 undef, i9 undef, i9 %shamt)
+  ret i9 %r
+}
+
+; If the shift amount is undef, treat it as zero: fshl returns operand 0, fshr operand 1.
+
+define i8 @fshl_shift_undef(i8 %x, i8 %y) {
+; CHECK-LABEL: @fshl_shift_undef(
+; CHECK-NEXT:    ret i8 [[X:%.*]]
+;
+  %r = call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 undef)
+  ret i8 %r
+}
+
+define i9 @fshr_shift_undef(i9 %x, i9 %y) {
+; CHECK-LABEL: @fshr_shift_undef(
+; CHECK-NEXT:    ret i9 [[Y:%.*]]
+;
+  %r = call i9 @llvm.fshr.i9(i9 %x, i9 %y, i9 undef)
+  ret i9 %r
+}
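
A standalone sanity check of the demanded-bits arithmetic introduced above
(not part of the patch): a minimal C++ sketch that models the mask split with
a fixed 32-bit width instead of APInt. The fshl() helper and all other names
here are illustrative only.

#include <cassert>
#include <cstdint>

static const unsigned BW = 32;

// Reference semantics for funnel shift left with 0 < C < BW:
// fshl(LHS, RHS, C) concatenates LHS:RHS and keeps the high BW bits
// after shifting left by C.
static uint32_t fshl(uint32_t LHS, uint32_t RHS, unsigned C) {
  return (LHS << C) | (RHS >> (BW - C));
}

int main() {
  // Mirrors the test fshl_only_op0_demanded: the caller keeps only bit 7.
  unsigned C = 7;
  uint32_t DemandedMask = 128;

  // Result bit i (i >= C) comes from LHS bit (i - C), so the LHS mask is
  // the demanded mask shifted right by C.
  uint32_t DemandedMaskLHS = DemandedMask >> C;        // bit 0 of LHS

  // Result bit i (i < C) comes from RHS bit (i + BW - C), so the low C
  // bits of the demanded mask move up by BW - C. Here none survive.
  uint32_t DemandedMaskRHS = DemandedMask << (BW - C); // no RHS bits

  assert(DemandedMaskLHS == 1 && DemandedMaskRHS == 0);

  // With no RHS bits demanded, fshl(X, Y, 7) & 128 must equal
  // (X << 7) & 128 for any Y -- exactly the rewrite the CHECK lines expect.
  uint32_t X = 0xABCD1234u, Y = 0x5A5A5A5Au;
  assert((fshl(X, Y, C) & DemandedMask) == ((X << C) & DemandedMask));
  return 0;
}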