Index: include/llvm/Analysis/DemandedBits.h
===================================================================
--- include/llvm/Analysis/DemandedBits.h
+++ include/llvm/Analysis/DemandedBits.h
@@ -44,6 +44,14 @@
     F(F), AC(AC), DT(DT) {}
 
   /// Return the bits demanded from instruction I.
+  ///
+  /// For vector instructions individual vector elements are not distinguished:
+  /// A bit is demanded if it is demanded for any of the vector elements. The
+  /// size of the return value corresponds to the type size in bits of the
+  /// scalar type.
+  ///
+  /// Instructions that do not have integer or vector of integer type are
+  /// accepted, but will always produce a mask with all bits set.
   APInt getDemandedBits(Instruction *I);
 
   /// Return true if, during analysis, I could not be reached.
Index: lib/Analysis/DemandedBits.cpp
===================================================================
--- lib/Analysis/DemandedBits.cpp
+++ lib/Analysis/DemandedBits.cpp
@@ -39,6 +39,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
 #include "llvm/Pass.h"
@@ -50,6 +51,7 @@
 #include <cstdint>
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "demanded-bits"
 
@@ -143,17 +145,17 @@
         }
         break;
       case Intrinsic::fshl:
-      case Intrinsic::fshr:
+      case Intrinsic::fshr: {
+        const APInt *SA;
         if (OperandNo == 2) {
           // Shift amount is modulo the bitwidth. For powers of two we have
           // SA % BW == SA & (BW - 1).
           if (isPowerOf2_32(BitWidth))
             AB = BitWidth - 1;
-        } else if (auto *SA = dyn_cast<ConstantInt>(II->getOperand(2))) {
-          // TODO: Support vectors.
+        } else if (match(II->getOperand(2), m_APInt(SA))) {
           // Normalize to funnel shift left. APInt shifts of BitWidth are well-
           // defined, so no need to special-case zero shifts here.
-          uint64_t ShiftAmt = SA->getValue().urem(BitWidth);
+          uint64_t ShiftAmt = SA->urem(BitWidth);
           if (II->getIntrinsicID() == Intrinsic::fshr)
             ShiftAmt = BitWidth - ShiftAmt;
 
@@ -164,6 +166,7 @@
         }
         break;
       }
+      }
     break;
   case Instruction::Add:
   case Instruction::Sub:
@@ -174,8 +177,9 @@
     AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits());
     break;
   case Instruction::Shl:
-    if (OperandNo == 0)
-      if (auto *ShiftAmtC = dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+    if (OperandNo == 0) {
+      const APInt *ShiftAmtC;
+      if (match(UserI->getOperand(1), m_APInt(ShiftAmtC))) {
        uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1);
        AB = AOut.lshr(ShiftAmt);
@@ -187,10 +191,12 @@
         else if (S->hasNoUnsignedWrap())
           AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
       }
+    }
     break;
   case Instruction::LShr:
-    if (OperandNo == 0)
-      if (auto *ShiftAmtC = dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+    if (OperandNo == 0) {
+      const APInt *ShiftAmtC;
+      if (match(UserI->getOperand(1), m_APInt(ShiftAmtC))) {
        uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1);
        AB = AOut.shl(ShiftAmt);
@@ -199,10 +205,12 @@
         if (cast<LShrOperator>(UserI)->isExact())
           AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
       }
+    }
     break;
   case Instruction::AShr:
-    if (OperandNo == 0)
-      if (auto *ShiftAmtC = dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+    if (OperandNo == 0) {
+      const APInt *ShiftAmtC;
+      if (match(UserI->getOperand(1), m_APInt(ShiftAmtC))) {
        uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1);
        AB = AOut.shl(ShiftAmt);
        // Because the high input bit is replicated into the
@@ -217,6 +225,7 @@
         if (cast<AShrOperator>(UserI)->isExact())
           AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
       }
+    }
     break;
   case Instruction::And:
     AB = AOut;
@@ -274,6 +283,15 @@
     if (OperandNo != 0)
       AB = AOut;
     break;
+  case Instruction::ExtractElement:
+    if (OperandNo == 0)
+      AB = AOut;
+    break;
+  case Instruction::InsertElement:
+  case Instruction::ShuffleVector:
+    if (OperandNo == 0 || OperandNo == 1)
+      AB = AOut;
+    break;
   }
 }
@@ -309,8 +327,9 @@
     // bits and add the instruction to the work list. For other instructions
     // add their operands to the work list (for integer values operands, mark
     // all bits as live).
-    if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
-      if (AliveBits.try_emplace(&I, IT->getBitWidth(), 0).second)
+    Type *T = I.getType();
+    if (T->isIntOrIntVectorTy()) {
+      if (AliveBits.try_emplace(&I, T->getScalarSizeInBits(), 0).second)
         Worklist.push_back(&I);
 
       continue;
@@ -319,8 +338,9 @@
     // Non-integer-typed instructions...
     for (Use &OI : I.operands()) {
       if (Instruction *J = dyn_cast<Instruction>(OI)) {
-        if (IntegerType *IT = dyn_cast<IntegerType>(J->getType()))
-          AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth());
+        Type *T = J->getType();
+        if (T->isIntOrIntVectorTy())
+          AliveBits[J] = APInt::getAllOnesValue(T->getScalarSizeInBits());
         Worklist.push_back(J);
       }
     }
@@ -336,13 +356,13 @@
     LLVM_DEBUG(dbgs() << "DemandedBits: Visiting: " << *UserI);
     APInt AOut;
-    if (UserI->getType()->isIntegerTy()) {
+    if (UserI->getType()->isIntOrIntVectorTy()) {
       AOut = AliveBits[UserI];
       LLVM_DEBUG(dbgs() << " Alive Out: " << AOut);
     }
     LLVM_DEBUG(dbgs() << "\n");
 
-    if (!UserI->getType()->isIntegerTy())
+    if (!UserI->getType()->isIntOrIntVectorTy())
       Visited.insert(UserI);
 
     KnownBits Known, Known2;
@@ -351,10 +371,11 @@
     // operand is added to the work-list.
     for (Use &OI : UserI->operands()) {
       if (Instruction *I = dyn_cast<Instruction>(OI)) {
-        if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) {
-          unsigned BitWidth = IT->getBitWidth();
+        Type *T = I->getType();
+        if (T->isIntOrIntVectorTy()) {
+          unsigned BitWidth = T->getScalarSizeInBits();
           APInt AB = APInt::getAllOnesValue(BitWidth);
-          if (UserI->getType()->isIntegerTy() && !AOut &&
+          if (UserI->getType()->isIntOrIntVectorTy() && !AOut &&
               !isAlwaysLive(UserI)) {
             AB = APInt(BitWidth, 0);
           } else {
@@ -389,11 +410,13 @@
 APInt DemandedBits::getDemandedBits(Instruction *I) {
   performAnalysis();
 
-  const DataLayout &DL = I->getModule()->getDataLayout();
   auto Found = AliveBits.find(I);
   if (Found != AliveBits.end())
     return Found->second;
-  return APInt::getAllOnesValue(DL.getTypeSizeInBits(I->getType()));
+
+  const DataLayout &DL = I->getModule()->getDataLayout();
+  return APInt::getAllOnesValue(
+      DL.getTypeSizeInBits(I->getType()->getScalarType()));
 }
 
 bool DemandedBits::isInstructionDead(Instruction *I) {
Index: lib/Transforms/Scalar/BDCE.cpp
===================================================================
--- lib/Transforms/Scalar/BDCE.cpp
+++ lib/Transforms/Scalar/BDCE.cpp
@@ -38,7 +38,8 @@
 /// instruction may need to be cleared of assumptions that can no longer be
 /// guaranteed correct.
 static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
-  assert(I->getType()->isIntegerTy() && "Trivializing a non-integer value?");
+  assert(I->getType()->isIntOrIntVectorTy() &&
+         "Trivializing a non-integer value?");
 
   // Initialize the worklist with eligible direct users.
   SmallVector<Instruction *, 16> WorkList;
@@ -46,13 +47,13 @@
     // If all bits of a user are demanded, then we know that nothing below that
     // in the def-use chain needs to be changed.
     auto *J = dyn_cast<Instruction>(JU);
-    if (J && J->getType()->isSized() &&
+    if (J && J->getType()->isIntOrIntVectorTy() &&
         !DB.getDemandedBits(J).isAllOnesValue())
       WorkList.push_back(J);
 
-    // Note that we need to check for unsized types above before asking for
+    // Note that we need to check for non-int types above before asking for
     // demanded bits. Normally, the only way to reach an instruction with an
-    // unsized type is via an instruction that has side effects (or otherwise
+    // non-int type is via an instruction that has side effects (or otherwise
     // will demand its input bits). However, if we have a readnone function
     // that returns an unsized type (e.g., void), we must avoid asking for the
     // demanded bits of the function call's return value. A void-returning
@@ -78,7 +79,7 @@
       // If all bits of a user are demanded, then we know that nothing below
       // that in the def-use chain needs to be changed.
       auto *K = dyn_cast<Instruction>(KU);
-      if (K && !Visited.count(K) && K->getType()->isSized() &&
+      if (K && !Visited.count(K) && K->getType()->isIntOrIntVectorTy() &&
          !DB.getDemandedBits(K).isAllOnesValue())
        WorkList.push_back(K);
     }
@@ -95,7 +96,7 @@
     if (I.mayHaveSideEffects() && I.use_empty())
       continue;
 
-    if (I.getType()->isIntegerTy() &&
+    if (I.getType()->isIntOrIntVectorTy() &&
        !DB.getDemandedBits(&I).getBoolValue()) {
      // For live instructions that have all dead bits, first make them dead by
      // replacing all uses with something else. Then, if they don't need to
Index: test/Analysis/DemandedBits/vectors.ll
===================================================================
--- /dev/null
+++ test/Analysis/DemandedBits/vectors.ll
@@ -0,0 +1,136 @@
+; RUN: opt -S -demanded-bits -analyze < %s | FileCheck %s
+; RUN: opt -S -disable-output -passes="print<demanded-bits>" < %s 2>&1 | FileCheck %s
+
+; CHECK-DAG: DemandedBits: 0xff00 for %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff00 for %y = or <2 x i32> %b, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff00 for %z = or <2 x i32> %x, %y
+; CHECK-DAG: DemandedBits: 0xff for %u = lshr <2 x i32> %z, <i32 8, i32 8>
+; CHECK-DAG: DemandedBits: 0xff for %r = trunc <2 x i32> %u to <2 x i8>
+define <2 x i8> @test_basic(<2 x i32> %a, <2 x i32> %b) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = or <2 x i32> %b, zeroinitializer
+  %z = or <2 x i32> %x, %y
+  %u = lshr <2 x i32> %z, <i32 8, i32 8>
+  %r = trunc <2 x i32> %u to <2 x i8>
+  ret <2 x i8> %r
+}
+
+; Vector-specific instructions
+
+; CHECK-DAG: DemandedBits: 0xff for %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0 for %z = extractelement <2 x i32> %x, i32 1
+; CHECK-DAG: DemandedBits: 0xf for %y = extractelement <2 x i32> %x, i32 0
+; CHECK-DAG: DemandedBits: 0xffffffff for %u = and i32 %y, 15
+; CHECK-DAG: DemandedBits: 0xffffffff for %v = and i32 %z, 240
+; CHECK-DAG: DemandedBits: 0xffffffff for %r = or i32 %u, %v
+define i32 @test_extractelement(<2 x i32> %a) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = extractelement <2 x i32> %x, i32 0
+  %z = extractelement <2 x i32> %x, i32 1
+  %u = and i32 %y, 15
+  %v = and i32 %z, 240
+  %r = or i32 %u, %v
+  ret i32 %r
+}
+
+; CHECK-DAG: DemandedBits: 0xff for %x = or i32 %a, 0
+; CHECK-DAG: DemandedBits: 0xff for %y = or i32 %b, 0
+; CHECK-DAG: DemandedBits: 0xff for %z = insertelement <2 x i32> undef, i32 %x, i32 0
+; CHECK-DAG: DemandedBits: 0xff for %u = insertelement <2 x i32> %z, i32 %y, i32 1
+; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %u, <i32 255, i32 255>
+define <2 x i32> @test_insertelement(i32 %a, i32 %b) {
+  %x = or i32 %a, 0
+  %y = or i32 %b, 0
+  %z = insertelement <2 x i32> undef, i32 %x, i32 0
+  %u = insertelement <2 x i32> %z, i32 %y, i32 1
+  %r = and <2 x i32> %u, <i32 255, i32 255>
+  ret <2 x i32> %r
+}
+
+; CHECK-DAG: DemandedBits: 0xff for %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff for %y = or <2 x i32> %b, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff for %z = shufflevector <2 x i32> %x, <2 x i32> %y, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <3 x i32> %z, <i32 255, i32 255, i32 255>
+define <3 x i32> @test_shufflevector(<2 x i32> %a, <2 x i32> %b) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = or <2 x i32> %b, zeroinitializer
+  %z = shufflevector <2 x i32> %x, <2 x i32> %y, <3 x i32> <i32 0, i32 1, i32 2>
+  %r = and <3 x i32> %z, <i32 255, i32 255, i32 255>
+  ret <3 x i32> %r
+}
+
+; Shifts with splat shift amounts
+
+; CHECK-DAG: DemandedBits: 0xf for %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0 for %y = shl <2 x i32> %x, <i32 4, i32 4>
+; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %y, <i32 240, i32 240>
+define <2 x i32> @test_shl(<2 x i32> %a) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = shl <2 x i32> %x, <i32 4, i32 4>
+  %r = and <2 x i32> %y, <i32 240, i32 240>
+  ret <2 x i32> %r
+}
+
+; CHECK-DAG: DemandedBits: 0xf00 for %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0 for %y = ashr <2 x i32> %x, <i32 4, i32 4>
+; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %y, <i32 240, i32 240>
+define <2 x i32> @test_ashr(<2 x i32> %a) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = ashr <2 x i32> %x, <i32 4, i32 4>
+  %r = and <2 x i32> %y, <i32 240, i32 240>
+  ret <2 x i32> %r
+}
+
+; CHECK-DAG: DemandedBits: 0xf00 for %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0 for %y = lshr <2 x i32> %x, <i32 4, i32 4>
+; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %y, <i32 240, i32 240>
+define <2 x i32> @test_lshr(<2 x i32> %a) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = lshr <2 x i32> %x, <i32 4, i32 4>
+  %r = and <2 x i32> %y, <i32 240, i32 240>
+  ret <2 x i32> %r
+}
+
+declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
+
+; CHECK-DAG: DemandedBits: 0xf for %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0000000 for %y = or <2 x i32> %b, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff for %z = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 4>)
+; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %z, <i32 255, i32 255>
+define <2 x i32> @test_fshl(<2 x i32> %a, <2 x i32> %b) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = or <2 x i32> %b, zeroinitializer
+  %z = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 4>)
+  %r = and <2 x i32> %z, <i32 255, i32 255>
+  ret <2 x i32> %r
+}
+
+; CHECK-DAG: DemandedBits: 0xf for %x = or <2 x i32> %a, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xf0000000 for %y = or <2 x i32> %b, zeroinitializer
+; CHECK-DAG: DemandedBits: 0xff for %z = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 28, i32 28>)
+; CHECK-DAG: DemandedBits: 0xffffffff for %r = and <2 x i32> %z, <i32 255, i32 255>
+define <2 x i32> @test_fshr(<2 x i32> %a, <2 x i32> %b) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %y = or <2 x i32> %b, zeroinitializer
+  %z = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 28, i32 28>)
+  %r = and <2 x i32> %z, <i32 255, i32 255>
+  ret <2 x i32> %r
+}
+
+; FP / Int conversion. These have different input / output types.
+
+; CHECK-DAG: DemandedBits: 0xffffffff for %x = or <2 x i32> %a, zeroinitializer
+define <2 x float> @test_uitofp(<2 x i32> %a) {
+  %x = or <2 x i32> %a, zeroinitializer
+  %r = uitofp <2 x i32> %x to <2 x float>
+  ret <2 x float> %r
+}
+
+; CHECK-DAG: DemandedBits: 0xffffffff for %y = fptoui <2 x float> %x to <2 x i32>
+define <2 x i32> @test_fptoui(<2 x float> %a) {
+  %x = fadd <2 x float> %a, <float 1.0, float 1.0>
+  %y = fptoui <2 x float> %x to <2 x i32>
+  %r = and <2 x i32> %y, <i32 255, i32 255>
+  ret <2 x i32> %y
+}
Index: test/Transforms/BDCE/vectors.ll
===================================================================
--- test/Transforms/BDCE/vectors.ll
+++ test/Transforms/BDCE/vectors.ll
@@ -7,12 +7,9 @@
 
 define <2 x i32> @test_basic(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: @test_basic(
-; CHECK-NEXT:    [[A2:%.*]] = add <2 x i32> [[A:%.*]],
-; CHECK-NEXT:    [[A3:%.*]] = and <2 x i32> [[A2]],
 ; CHECK-NEXT:    [[B2:%.*]] = add <2 x i32> [[B:%.*]],
 ; CHECK-NEXT:    [[B3:%.*]] = and <2 x i32> [[B2]],
-; CHECK-NEXT:    [[C:%.*]] = or <2 x i32> [[A3]], [[B3]]
-; CHECK-NEXT:    [[D:%.*]] = ashr <2 x i32> [[C]],
+; CHECK-NEXT:    [[D:%.*]] = ashr <2 x i32> [[B3]],
 ; CHECK-NEXT:    ret <2 x i32> [[D]]
 ;
 ; CHECK-IO-LABEL: @test_basic(
@@ -36,12 +33,9 @@
 ; Going vector -> scalar
 define i32 @test_extractelement(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: @test_extractelement(
-; CHECK-NEXT:    [[A2:%.*]] = add <2 x i32> [[A:%.*]],
-; CHECK-NEXT:    [[A3:%.*]] = and <2 x i32> [[A2]],
 ; CHECK-NEXT:    [[B2:%.*]] = add <2 x i32> [[B:%.*]],
 ; CHECK-NEXT:    [[B3:%.*]] = and <2 x i32> [[B2]],
-; CHECK-NEXT:    [[C:%.*]] = or <2 x i32> [[A3]], [[B3]]
-; CHECK-NEXT:    [[D:%.*]] = extractelement <2 x i32> [[C]], i32 0
+; CHECK-NEXT:    [[D:%.*]] = extractelement <2 x i32> [[B3]], i32 0
 ; CHECK-NEXT:    [[E:%.*]] = ashr i32 [[D]], 3
 ; CHECK-NEXT:    ret i32 [[E]]
 ;
@@ -68,14 +62,10 @@
 ; Going scalar -> vector
 define <2 x i32> @test_insertelement(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test_insertelement(
-; CHECK-NEXT:    [[X:%.*]] = insertelement <2 x i32> undef, i32 [[A:%.*]], i32 0
-; CHECK-NEXT:    [[X2:%.*]] = insertelement <2 x i32> [[X]], i32 [[B:%.*]], i32 1
-; CHECK-NEXT:    [[X3:%.*]] = and <2 x i32> [[X2]],
-; CHECK-NEXT:    [[Y:%.*]] = insertelement <2 x i32> undef, i32 [[B]], i32 0
-; CHECK-NEXT:    [[Y2:%.*]] = insertelement <2 x i32> [[Y]], i32 [[A]], i32 1
+; CHECK-NEXT:    [[Y:%.*]] = insertelement <2 x i32> undef, i32 [[B:%.*]], i32 0
+; CHECK-NEXT:    [[Y2:%.*]] = insertelement <2 x i32> [[Y]], i32 [[A:%.*]], i32 1
 ; CHECK-NEXT:    [[Y3:%.*]] = and <2 x i32> [[Y2]],
-; CHECK-NEXT:    [[Z:%.*]] = or <2 x i32> [[X3]], [[Y3]]
-; CHECK-NEXT:    [[U:%.*]] = ashr <2 x i32> [[Z]],
+; CHECK-NEXT:    [[U:%.*]] = ashr <2 x i32> [[Y3]],
 ; CHECK-NEXT:    ret <2 x i32> [[U]]
 ;
 ; CHECK-IO-LABEL: @test_insertelement(
@@ -132,10 +122,8 @@
 ; Assumption invalidation (adapted from invalidate-assumptions.ll)
 define <2 x i1> @test_assumption_invalidation(<2 x i1> %b, <2 x i8> %x) {
 ; CHECK-LABEL: @test_assumption_invalidation(
-; CHECK-NEXT:    [[SETBIT:%.*]] = or <2 x i8> [[X:%.*]],
 ; CHECK-NEXT:    [[LITTLE_NUMBER:%.*]] = zext <2 x i1> [[B:%.*]] to <2 x i8>
-; CHECK-NEXT:    [[BIG_NUMBER:%.*]] = shl <2 x i8> [[SETBIT]],
-; CHECK-NEXT:    [[SUB:%.*]] = sub nuw <2 x i8> [[BIG_NUMBER]], [[LITTLE_NUMBER]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i8> zeroinitializer, [[LITTLE_NUMBER]]
 ; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <2 x i8> [[SUB]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[TRUNC]]
 ;
Index: test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll
@@ -0,0 +1,20 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; getDemandedBits() is called on the pointer-typed GEP instruction here.
+; Only make sure we do not crash.
+
+; CHECK: @test
+define void @test(i8* %ptr, i8* %ptr_end) {
+start:
+  br label %loop
+
+loop:
+  %ptr2 = phi i8* [ %ptr3, %loop ], [ %ptr, %start ]
+  %x = sext i8 undef to i64
+  %ptr3 = getelementptr inbounds i8, i8* %ptr2, i64 1
+  %cmp = icmp ult i8* %ptr3, %ptr_end
+  br i1 %cmp, label %loop, label %end
+
+end:
+  ret void
+}
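
Note (reviewer sketch, not part of the patch): the snippet below illustrates how a client could consume the vector-aware getDemandedBits() documented in the header change above. It uses only the existing DemandedBits and APInt APIs touched here; the helper name getMinimumScalarWidth and its clamping choice are illustrative assumptions, not code from this change.

  #include <algorithm>
  #include <cassert>
  #include "llvm/ADT/APInt.h"
  #include "llvm/Analysis/DemandedBits.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Type.h"

  using namespace llvm;

  // Hypothetical helper: the mask returned by getDemandedBits() is per scalar
  // element (its width equals Type::getScalarSizeInBits()), and a bit is set
  // if it is demanded in *any* lane, so its active-bit count bounds the
  // element width a consumer actually needs.
  static unsigned getMinimumScalarWidth(Instruction *Inst, DemandedBits &DB) {
    assert(Inst->getType()->isIntOrIntVectorTy() &&
           "expected int or int vector");
    APInt Demanded = DB.getDemandedBits(Inst);
    // getActiveBits() is one past the highest set bit; clamp so a fully dead
    // value still reports a width of at least one bit.
    return std::max(Demanded.getActiveBits(), 1u);
  }

This mirrors how BDCE only inspects isAllOnesValue()/getBoolValue() on the returned mask, and why the LoopVectorize regression test above merely checks that querying a pointer-typed instruction (which yields an all-ones mask of pointer width) does not crash.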