diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2087,8 +2087,6 @@
   // TODO: Can we reduce the code duplication between this and the related
   // rotate matching code under visitSelect and visitTrunc?
   unsigned Width = Or.getType()->getScalarSizeInBits();
-  if (!isPowerOf2_32(Width))
-    return nullptr;
 
   // First, find an or'd pair of opposite shifts with the same shifted operand:
   // or (lshr ShVal, ShAmt0), (shl ShVal, ShAmt1)
@@ -2110,6 +2108,18 @@
   // Match the shift amount operands for a rotate pattern. This always matches
   // a subtraction on the R operand.
   auto matchShiftAmount = [](Value *L, Value *R, unsigned Width) -> Value * {
+    // Check for constant shift amounts that sum to the bitwidth.
+    // TODO: Support non-uniform shift amounts.
+    const APInt *LC, *RC;
+    if (match(L, m_APInt(LC)) && match(R, m_APInt(RC)))
+      if (LC->ult(Width) && RC->ult(Width) && (*LC + *RC) == Width)
+        return L;
+
+    // For non-constant cases we don't support non-pow2 shift masks.
+    // TODO: Is it worth matching urem as well?
+    if (!isPowerOf2_32(Width))
+      return nullptr;
+
     // The shift amount may be masked with negation:
     // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1)))
     Value *X;
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -2940,6 +2940,50 @@
         Result->Provenance[i] = BitPart::Unset;
       return Result;
     }
+
+    if (auto *CI = dyn_cast<CallInst>(I)) {
+      Value *Callee = CI->getCalledOperand();
+      // Handle intrinsic calls.
+      if (auto *F = dyn_cast<Function>(Callee)) {
+        Intrinsic::ID IntrinsicID = F->getIntrinsicID();
+
+        // Funnel 'double' shifts take 3 operands, 2 inputs and the shift
+        // amount (modulo).
+        // fshl(X,Y,Z): (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+        // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+        const APInt *Amt;
+        if ((IntrinsicID == Intrinsic::fshl ||
+             IntrinsicID == Intrinsic::fshr) &&
+            match(CI->getArgOperand(2), m_APInt(Amt))) {
+
+          // We can treat fshr as a fshl by flipping the modulo amount.
+          unsigned ModAmt = Amt->urem(BitWidth);
+          if (IntrinsicID == Intrinsic::fshr)
+            ModAmt = BitWidth - ModAmt;
+
+          const auto &LHS = collectBitParts(CI->getArgOperand(0), MatchBSwaps,
+                                            MatchBitReversals, BPS, Depth + 1);
+          const auto &RHS = collectBitParts(CI->getArgOperand(1), MatchBSwaps,
+                                            MatchBitReversals, BPS, Depth + 1);
+          // Both sources must be understood and must share a provider: the
+          // result is reported as bits of a single value.
+          if (!LHS || !RHS || LHS->Provider != RHS->Provider)
+            return Result;
+
+          // Following the fshl formula above: LHS is shifted up by ModAmt
+          // and the top ModAmt bits of RHS fill the low ModAmt result bits.
+          // NOTE(review): relies on Provenance being indexed by result bit
+          // with index 0 as the LSB - confirm against the shl/lshr handling.
+          unsigned StartBitRHS = BitWidth - ModAmt;
+          Result = LHS;
+          for (unsigned BitIdx = 0; BitIdx < StartBitRHS; ++BitIdx)
+            Result->Provenance[BitIdx + ModAmt] = LHS->Provenance[BitIdx];
+          for (unsigned BitIdx = 0; BitIdx < ModAmt; ++BitIdx)
+            Result->Provenance[BitIdx] = RHS->Provenance[BitIdx + StartBitRHS];
+          return Result;
+        }
+      }
+    }
   }
 
   // Okay, we got to something that isn't a shift, 'or' or 'and'.
This must be diff --git a/llvm/test/Transforms/InstCombine/bswap.ll b/llvm/test/Transforms/InstCombine/bswap.ll --- a/llvm/test/Transforms/InstCombine/bswap.ll +++ b/llvm/test/Transforms/InstCombine/bswap.ll @@ -123,9 +123,7 @@ define i32 @bswap32_and_first_extra_use(i32 %x) { ; CHECK-LABEL: @bswap32_and_first_extra_use( -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 16 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16 -; CHECK-NEXT: [[SWAPHALF:%.*]] = or i32 [[SHL]], [[SHR]] +; CHECK-NEXT: [[SWAPHALF:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 16) ; CHECK-NEXT: [[T:%.*]] = and i32 [[SWAPHALF]], 16711935 ; CHECK-NEXT: [[BSWAP:%.*]] = call i32 @llvm.bswap.i32(i32 [[X]]) ; CHECK-NEXT: call void @extra_use(i32 [[T]]) @@ -169,10 +167,8 @@ define i32 @bswap32_shl_first_extra_use(i32 %x) { ; CHECK-LABEL: @bswap32_shl_first_extra_use( -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 16 -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X]], 24 -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[SHR]], 8 -; CHECK-NEXT: [[T:%.*]] = or i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[SWAPHALF:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 16) +; CHECK-NEXT: [[T:%.*]] = shl i32 [[SWAPHALF]], 8 ; CHECK-NEXT: [[BSWAP:%.*]] = call i32 @llvm.bswap.i32(i32 [[X]]) ; CHECK-NEXT: call void @extra_use(i32 [[T]]) ; CHECK-NEXT: ret i32 [[BSWAP]] diff --git a/llvm/test/Transforms/InstCombine/fsh.ll b/llvm/test/Transforms/InstCombine/fsh.ll --- a/llvm/test/Transforms/InstCombine/fsh.ll +++ b/llvm/test/Transforms/InstCombine/fsh.ll @@ -521,9 +521,9 @@ define i33 @expanded_fshr_multi_use(i33 %a) { ; CHECK-LABEL: @expanded_fshr_multi_use( -; CHECK-NEXT: [[TMP:%.*]] = lshr i33 [[A:%.*]], 1 -; CHECK-NEXT: [[C:%.*]] = lshr i33 [[A]], 24 -; CHECK-NEXT: [[D:%.*]] = xor i33 [[C]], [[TMP]] +; CHECK-NEXT: [[B:%.*]] = call i33 @llvm.fshl.i33(i33 [[A:%.*]], i33 [[A]], i33 32) +; CHECK-NEXT: [[C:%.*]] = lshr i33 [[B]], 23 +; CHECK-NEXT: [[D:%.*]] = xor i33 [[C]], [[B]] ; CHECK-NEXT: [[E:%.*]] = 
and i33 [[D]], 31 ; CHECK-NEXT: ret i33 [[E]] ; diff --git a/llvm/test/Transforms/InstCombine/or-concat.ll b/llvm/test/Transforms/InstCombine/or-concat.ll --- a/llvm/test/Transforms/InstCombine/or-concat.ll +++ b/llvm/test/Transforms/InstCombine/or-concat.ll @@ -47,11 +47,9 @@ define i64 @concat_bswap32_unary_flip(i64 %a0) { ; CHECK-LABEL: @concat_bswap32_unary_flip( -; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[A0:%.*]], 32 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[A0]], 32 -; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) -; CHECK-NEXT: ret i64 [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.fshl.i64(i64 [[A0:%.*]], i64 [[A0]], i64 32) +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]]) +; CHECK-NEXT: ret i64 [[TMP2]] ; %1 = lshr i64 %a0, 32 %2 = trunc i64 %1 to i32 @@ -67,11 +65,9 @@ define <2 x i64> @concat_bswap32_unary_flip_vector(<2 x i64> %a0) { ; CHECK-LABEL: @concat_bswap32_unary_flip_vector( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[A0:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[A0]], -; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP3]]) -; CHECK-NEXT: ret <2 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[A0:%.*]], <2 x i64> [[A0]], <2 x i64> ) +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP1]]) +; CHECK-NEXT: ret <2 x i64> [[TMP2]] ; %1 = lshr <2 x i64> %a0, %2 = trunc <2 x i64> %1 to <2 x i32> @@ -162,11 +158,9 @@ define i64 @concat_bitreverse32_unary_flip(i64 %a0) { ; CHECK-LABEL: @concat_bitreverse32_unary_flip( -; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[A0:%.*]], 32 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[A0]], 32 -; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[TMP3]]) -; CHECK-NEXT: ret i64 [[TMP4]] +; 
CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.fshl.i64(i64 [[A0:%.*]], i64 [[A0]], i64 32) +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[TMP1]]) +; CHECK-NEXT: ret i64 [[TMP2]] ; %1 = lshr i64 %a0, 32 %2 = trunc i64 %1 to i32 @@ -182,11 +176,9 @@ define <2 x i64> @concat_bitreverse32_unary_flip_vector(<2 x i64> %a0) { ; CHECK-LABEL: @concat_bitreverse32_unary_flip_vector( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[A0:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[A0]], -; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP3]]) -; CHECK-NEXT: ret <2 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[A0:%.*]], <2 x i64> [[A0]], <2 x i64> ) +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; CHECK-NEXT: ret <2 x i64> [[TMP2]] ; %1 = lshr <2 x i64> %a0, %2 = trunc <2 x i64> %1 to <2 x i32> diff --git a/llvm/test/Transforms/InstCombine/rotate.ll b/llvm/test/Transforms/InstCombine/rotate.ll --- a/llvm/test/Transforms/InstCombine/rotate.ll +++ b/llvm/test/Transforms/InstCombine/rotate.ll @@ -3,16 +3,14 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" -; TODO: Canonicalize rotate by constant to funnel shift intrinsics. +; Canonicalize rotate by constant to funnel shift intrinsics. ; This should help cost modeling for vectorization, inlining, etc. ; If a target does not have a rotate instruction, the expansion will ; be exactly these same 3 basic ops (shl/lshr/or). 
define i32 @rotl_i32_constant(i32 %x) { ; CHECK-LABEL: @rotl_i32_constant( -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 11 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 21 -; CHECK-NEXT: [[R:%.*]] = or i32 [[SHR]], [[SHL]] +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 11) ; CHECK-NEXT: ret i32 [[R]] ; %shl = shl i32 %x, 11 @@ -23,9 +21,7 @@ define i42 @rotr_i42_constant(i42 %x) { ; CHECK-LABEL: @rotr_i42_constant( -; CHECK-NEXT: [[SHL:%.*]] = shl i42 [[X:%.*]], 31 -; CHECK-NEXT: [[SHR:%.*]] = lshr i42 [[X]], 11 -; CHECK-NEXT: [[R:%.*]] = or i42 [[SHR]], [[SHL]] +; CHECK-NEXT: [[R:%.*]] = call i42 @llvm.fshl.i42(i42 [[X:%.*]], i42 [[X]], i42 31) ; CHECK-NEXT: ret i42 [[R]] ; %shl = shl i42 %x, 31 @@ -36,9 +32,7 @@ define i8 @rotr_i8_constant_commute(i8 %x) { ; CHECK-LABEL: @rotr_i8_constant_commute( -; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5 -; CHECK-NEXT: [[SHR:%.*]] = lshr i8 [[X]], 3 -; CHECK-NEXT: [[R:%.*]] = or i8 [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.fshl.i8(i8 [[X:%.*]], i8 [[X]], i8 5) ; CHECK-NEXT: ret i8 [[R]] ; %shl = shl i8 %x, 5 @@ -49,9 +43,7 @@ define i88 @rotl_i88_constant_commute(i88 %x) { ; CHECK-LABEL: @rotl_i88_constant_commute( -; CHECK-NEXT: [[SHL:%.*]] = shl i88 [[X:%.*]], 44 -; CHECK-NEXT: [[SHR:%.*]] = lshr i88 [[X]], 44 -; CHECK-NEXT: [[R:%.*]] = or i88 [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = call i88 @llvm.fshl.i88(i88 [[X:%.*]], i88 [[X]], i88 44) ; CHECK-NEXT: ret i88 [[R]] ; %shl = shl i88 %x, 44 @@ -64,9 +56,7 @@ define <2 x i16> @rotl_v2i16_constant_splat(<2 x i16> %x) { ; CHECK-LABEL: @rotl_v2i16_constant_splat( -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i16> [[X:%.*]], -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i16> [[X]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i16> [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[X]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; %shl = shl <2 x i16> %x, @@ -79,9 +69,7 @@ define <2 x 
i17> @rotr_v2i17_constant_splat(<2 x i17> %x) { ; CHECK-LABEL: @rotr_v2i17_constant_splat( -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i17> [[X:%.*]], -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i17> [[X]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i17> [[SHR]], [[SHL]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[X:%.*]], <2 x i17> [[X]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; %shl = shl <2 x i17> %x, @@ -90,7 +78,7 @@ ret <2 x i17> %r } -; Allow arbitrary shift constants. +; TODO: Allow arbitrary shift constants. define <2 x i32> @rotr_v2i32_constant_nonsplat(<2 x i32> %x) { ; CHECK-LABEL: @rotr_v2i32_constant_nonsplat(