diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1188,6 +1188,19 @@
     Value *IIOperand = II->getArgOperand(0);
     Value *X = nullptr;
 
+    KnownBits Known = computeKnownBits(IIOperand, 0, II);
+    unsigned BitWidth = Known.getBitWidth();
+
+    // bswap(x) -> shl(x) if x has at most 8 active bits (a single low byte)
+    if (Known.countMaxActiveBits() <= 8)
+      return BinaryOperator::CreateNUWShl(
+          IIOperand, ConstantInt::get(IIOperand->getType(), BitWidth - 8));
+
+    // bswap(x) -> lshr(x) if x has at most 8 active high bits (a single high byte)
+    if (BitWidth - Known.countMinTrailingZeros() <= 8)
+      return BinaryOperator::CreateExactLShr(
+          IIOperand, ConstantInt::get(IIOperand->getType(), BitWidth - 8));
+
     // bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
     if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
       unsigned C = X->getType()->getScalarSizeInBits() -
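For reviewers: both rewrites follow from one byte-permutation fact. If the only byte of x that can be nonzero is the lowest one, byte-reversal moves that byte to the top, which is exactly shl by BitWidth - 8; symmetrically, if only the highest byte can be nonzero, bswap is lshr by BitWidth - 8. A minimal standalone sanity check of the i64 case follows; it is illustrative only and not part of the patch, and bswap64 is a hand-rolled reference for llvm.bswap.i64 rather than an LLVM API.

#include <cassert>
#include <cstdint>

// Reference byte-reversal with the same semantics as llvm.bswap.i64:
// byte I of the input becomes byte 7 - I of the result.
static uint64_t bswap64(uint64_t X) {
  uint64_t R = 0;
  for (int I = 0; I < 8; ++I)
    R |= ((X >> (8 * I)) & 0xff) << (8 * (7 - I));
  return R;
}

int main() {
  for (uint64_t B = 0; B <= 0xff; ++B) {
    // Low-byte case: at most 8 active bits, so bswap(x) == x << 56.
    assert(bswap64(B) == B << 56);
    // High-byte case: at most 8 active high bits, so bswap(x) == x >> 56.
    assert(bswap64(B << 56) == (B << 56) >> 56);
  }
  return 0;
}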
diff --git a/llvm/test/Transforms/InstCombine/bswap-fold.ll b/llvm/test/Transforms/InstCombine/bswap-fold.ll
--- a/llvm/test/Transforms/InstCombine/bswap-fold.ll
+++ b/llvm/test/Transforms/InstCombine/bswap-fold.ll
@@ -355,6 +355,185 @@
   ret i64 %t3
 }
 
+
+define i64 @bs_active_high8(i64 %0) {
+; CHECK-LABEL: @bs_active_high8(
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP0:%.*]], 255
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %2 = shl i64 %0, 56
+  %3 = call i64 @llvm.bswap.i64(i64 %2)
+  ret i64 %3
+}
+
+define i32 @bs_active_high7(i32 %0) {
+; CHECK-LABEL: @bs_active_high7(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 24
+; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP2]], 254
+; CHECK-NEXT:    ret i32 [[TMP3]]
+;
+  %2 = and i32 %0, -33554432 ; 0xfe000000
+  %3 = call i32 @llvm.bswap.i32(i32 %2)
+  ret i32 %3
+}
+
+define <2 x i64> @bs_active_high4(<2 x i64> %0) {
+; CHECK-LABEL: @bs_active_high4(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP0:%.*]], <i64 4, i64 4>
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 240, i64 240>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %2 = shl <2 x i64> %0, <i64 60, i64 60>
+  %3 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %2)
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @bs_active_high_different(<2 x i64> %0) {
+; CHECK-LABEL: @bs_active_high_different(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP0:%.*]], <i64 56, i64 57>
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr exact <2 x i64> [[TMP2]], <i64 56, i64 56>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %2 = shl <2 x i64> %0, <i64 56, i64 57>
+  %3 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %2)
+  ret <2 x i64> %3
+}
+
+; negative test
+define <2 x i64> @bs_active_high_different_negative(<2 x i64> %0) {
+; CHECK-LABEL: @bs_active_high_different_negative(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP0:%.*]], <i64 56, i64 55>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP2]])
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %2 = shl <2 x i64> %0, <i64 56, i64 55> ; second elem has 9 active high bits
+  %3 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %2)
+  ret <2 x i64> %3
+}
+
+; negative test
+define <2 x i64> @bs_active_high_undef(<2 x i64> %0) {
+; CHECK-LABEL: @bs_active_high_undef(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP0:%.*]], <i64 56, i64 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP2]])
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %2 = shl <2 x i64> %0, <i64 56, i64 undef>
+  %3 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %2)
+  ret <2 x i64> %3
+}
+
+define i64 @bs_active_high8_multiuse(i64 %0) {
+; CHECK-LABEL: @bs_active_high8_multiuse(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0:%.*]], 56
+; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[TMP0]], 255
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+  %2 = shl i64 %0, 56
+  %3 = call i64 @llvm.bswap.i64(i64 %2)
+  %4 = mul i64 %2, %3 ; increase the use count of the shl and the bswap
+  ret i64 %4
+}
+
+define i64 @bs_active_high7_multiuse(i64 %0) {
+; CHECK-LABEL: @bs_active_high7_multiuse(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0:%.*]], 57
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[TMP3]], 254
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    ret i64 [[TMP5]]
+;
+  %2 = shl i64 %0, 57
+  %3 = call i64 @llvm.bswap.i64(i64 %2)
+  %4 = mul i64 %2, %3 ; increase the use count of the shl and the bswap
+  ret i64 %4
+}
+
+
+define i16 @bs_active_low1(i16 %0) {
+; CHECK-LABEL: @bs_active_low1(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i16 [[TMP0:%.*]], 7
+; CHECK-NEXT:    [[TMP3:%.*]] = and i16 [[TMP2]], 256
+; CHECK-NEXT:    ret i16 [[TMP3]]
+;
+  %2 = lshr i16 %0, 15
+  %3 = call i16 @llvm.bswap.i16(i16 %2)
+  ret i16 %3
+}
+
+define <2 x i32> @bs_active_low8(<2 x i32> %0) {
+; CHECK-LABEL: @bs_active_low8(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i32> [[TMP0:%.*]], <i32 24, i32 24>
+; CHECK-NEXT:    ret <2 x i32> [[TMP2]]
+;
+  %2 = and <2 x i32> %0, <i32 255, i32 255>
+  %3 = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %2)
+  ret <2 x i32> %3
+}
+
+define <2 x i32> @bs_active_low_different(<2 x i32> %0) {
+; CHECK-LABEL: @bs_active_low_different(
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i32> [[TMP0:%.*]], <i32 2, i32 128>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <2 x i32> [[TMP2]], <i32 24, i32 24>
+; CHECK-NEXT:    ret <2 x i32> [[TMP3]]
+;
+  %2 = and <2 x i32> %0, <i32 2, i32 128>
+  %3 = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %2)
+  ret <2 x i32> %3
+}
+
+; negative test
+define <2 x i32> @bs_active_low_different_negative(<2 x i32> %0) {
+; CHECK-LABEL: @bs_active_low_different_negative(
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i32> [[TMP0:%.*]], <i32 256, i32 255>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <2 x i32> [[TMP3]]
+;
+  %2 = and <2 x i32> %0, <i32 256, i32 255> ; first elem has 9 active low bits
+  %3 = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %2)
+  ret <2 x i32> %3
+}
+
+; negative test
+define <2 x i32> @bs_active_low_undef(<2 x i32> %0) {
+; CHECK-LABEL: @bs_active_low_undef(
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i32> [[TMP0:%.*]], <i32 255, i32 undef>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP2]])
+; CHECK-NEXT:    ret <2 x i32> [[TMP3]]
+;
+  %2 = and <2 x i32> %0, <i32 255, i32 undef>
+  %3 = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %2)
+  ret <2 x i32> %3
+}
+
+define i64 @bs_active_low8_multiuse(i64 %0) {
+; CHECK-LABEL: @bs_active_low8_multiuse(
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP0:%.*]], 255
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 56
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+  %2 = and i64 %0, 255
+  %3 = call i64 @llvm.bswap.i64(i64 %2)
+  %4 = mul i64 %2, %3 ; increase the use count of the and and the bswap
+  ret i64 %4
+}
+
+define i64 @bs_active_low7_multiuse(i64 %0) {
+; CHECK-LABEL: @bs_active_low7_multiuse(
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP0:%.*]], 127
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 56
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+  %2 = and i64 %0, 127
+  %3 = call i64 @llvm.bswap.i64(i64 %2)
+  %4 = mul i64 %2, %3 ; increase the use count of the and and the bswap
+  ret i64 %4
+}
+
+
 declare i16 @llvm.bswap.i16(i16)
 declare i32 @llvm.bswap.i32(i32)
 declare i64 @llvm.bswap.i64(i64)
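A note on the flags in the expected output: nuw on the shl is sound because in the low-byte case the shift discards only bits that are known zero, so the shift cannot wrap; exact on the lshr is sound because in the high-byte case only known-zero low bits are shifted out, which is precisely what exact asserts. The additional nsw seen in bs_active_low7_multiuse is presumably added by later flag strengthening once the shl operand is known to be at most 127, not by this fold itself.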