diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1349,6 +1349,24 @@ Value *IIOperand = II->getArgOperand(0); Value *X = nullptr; + // Try to canonicalize bswap-of-logical-shift-by-8-bit-multiple as + // inverse-shift-of-bswap: + // bswap (shl X, C) --> lshr (bswap X), C + // bswap (lshr X, C) --> shl (bswap X), C + // TODO: Use knownbits to allow variable shift and non-splat vector match. + BinaryOperator *BO; + if (match(IIOperand, m_OneUse(m_BinOp(BO)))) { + const APInt *C; + if (match(BO, m_LogicalShift(m_Value(X), m_APIntAllowUndef(C))) && + (*C & 7) == 0) { + Value *NewSwap = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, X); + BinaryOperator::BinaryOps InverseShift = + BO->getOpcode() == Instruction::Shl ? Instruction::LShr + : Instruction::Shl; + return BinaryOperator::Create(InverseShift, NewSwap, BO->getOperand(1)); + } + } + KnownBits Known = computeKnownBits(IIOperand, 0, II); uint64_t LZ = alignDown(Known.countMinLeadingZeros(), 8); uint64_t TZ = alignDown(Known.countMinTrailingZeros(), 8); diff --git a/llvm/test/Transforms/InstCombine/bswap-fold.ll b/llvm/test/Transforms/InstCombine/bswap-fold.ll --- a/llvm/test/Transforms/InstCombine/bswap-fold.ll +++ b/llvm/test/Transforms/InstCombine/bswap-fold.ll @@ -26,8 +26,8 @@ define i32 @lshr8_i32(i32 %x) { ; CHECK-LABEL: @lshr8_i32( -; CHECK-NEXT: [[S:%.*]] = lshr i32 [[X:%.*]], 8 -; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.bswap.i32(i32 [[S]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[X:%.*]]) +; CHECK-NEXT: [[R:%.*]] = shl i32 [[TMP1]], 8 ; CHECK-NEXT: ret i32 [[R]] ; %s = lshr i32 %x, 8 @@ -37,8 +37,8 @@ define <2 x i32> @lshr16_v2i32(<2 x i32> %x) { ; CHECK-LABEL: @lshr16_v2i32( -; CHECK-NEXT: [[S:%.*]] = lshr <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x 
i32> [[S]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[X:%.*]]) +; CHECK-NEXT: [[R:%.*]] = shl <2 x i32> [[TMP1]], ; CHECK-NEXT: ret <2 x i32> [[R]] ; %s = lshr <2 x i32> %x, @@ -48,14 +48,16 @@ define i32 @lshr24_i32(i32 %x) { ; CHECK-LABEL: @lshr24_i32( -; CHECK-NEXT: [[S:%.*]] = and i32 [[X:%.*]], -16777216 -; CHECK-NEXT: ret i32 [[S]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], -16777216 +; CHECK-NEXT: ret i32 [[TMP1]] ; %s = lshr i32 %x, 24 %r = call i32 @llvm.bswap.i32(i32 %s) ret i32 %r } +; negative test - need shift-by-8-bit-multiple + define i32 @lshr12_i32(i32 %x) { ; CHECK-LABEL: @lshr12_i32( ; CHECK-NEXT: [[S:%.*]] = lshr i32 [[X:%.*]], 12 @@ -67,6 +69,8 @@ ret i32 %r } +; negative test - the shift has an extra use + define i32 @lshr8_i32_use(i32 %x, i32* %p) { ; CHECK-LABEL: @lshr8_i32_use( ; CHECK-NEXT: [[S:%.*]] = lshr i32 [[X:%.*]], 12 @@ -82,8 +86,8 @@ define i64 @shl16_i64(i64 %x) { ; CHECK-LABEL: @shl16_i64( -; CHECK-NEXT: [[S:%.*]] = shl i64 [[X:%.*]], 16 -; CHECK-NEXT: [[R:%.*]] = call i64 @llvm.bswap.i64(i64 [[S]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[X:%.*]]) +; CHECK-NEXT: [[R:%.*]] = lshr i64 [[TMP1]], 16 ; CHECK-NEXT: ret i64 [[R]] ; %s = shl i64 %x, 16 @@ -91,10 +95,12 @@ ret i64 %r } +; poison vector element propagates + define <2 x i64> @shl16_v2i64(<2 x i64> %x) { ; CHECK-LABEL: @shl16_v2i64( -; CHECK-NEXT: [[S:%.*]] = shl <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[S]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[X:%.*]]) +; CHECK-NEXT: [[R:%.*]] = lshr <2 x i64> [[TMP1]], ; CHECK-NEXT: ret <2 x i64> [[R]] ; %s = shl <2 x i64> %x, @@ -104,14 +110,16 @@ define i64 @shl56_i64(i64 %x) { ; CHECK-LABEL: @shl56_i64( -; CHECK-NEXT: [[S:%.*]] = and i64 [[X:%.*]], 255 -; CHECK-NEXT: ret i64 [[S]] +; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 255 +; CHECK-NEXT: ret i64 [[TMP1]] ; %s = shl i64 %x, 56 %r = call i64 
@llvm.bswap.i64(i64 %s) ret i64 %r } +; negative test - need shift-by-8-bit-multiple + define i64 @shl42_i64(i64 %x) { ; CHECK-LABEL: @shl42_i64( ; CHECK-NEXT: [[S:%.*]] = shl i64 [[X:%.*]], 42 @@ -123,6 +131,8 @@ ret i64 %r } +; negative test - the shift has an extra use + define i32 @shl8_i32_use(i32 %x, i32* %p) { ; CHECK-LABEL: @shl8_i32_use( ; CHECK-NEXT: [[S:%.*]] = shl i32 [[X:%.*]], 8 @@ -136,11 +146,11 @@ ret i32 %r } +; swaps cancel + define i64 @swap_shl16_i64(i64 %x) { ; CHECK-LABEL: @swap_shl16_i64( -; CHECK-NEXT: [[B:%.*]] = call i64 @llvm.bswap.i64(i64 [[X:%.*]]) -; CHECK-NEXT: [[S:%.*]] = shl i64 [[B]], 16 -; CHECK-NEXT: [[R:%.*]] = call i64 @llvm.bswap.i64(i64 [[S]]) +; CHECK-NEXT: [[R:%.*]] = lshr i64 [[X:%.*]], 16 ; CHECK-NEXT: ret i64 [[R]] ; %b = call i64 @llvm.bswap.i64(i64 %x) @@ -536,11 +546,11 @@ ret <2 x i64> %3 } -; negative test +; TODO: This should fold to 'and'. define <2 x i64> @bs_active_high_undef(<2 x i64> %0) { ; CHECK-LABEL: @bs_active_high_undef( -; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP0:%.*]], -; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP2]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP0:%.*]]) +; CHECK-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], ; CHECK-NEXT: ret <2 x i64> [[TMP3]] ; %2 = shl <2 x i64> %0,