Index: llvm/trunk/include/llvm/Transforms/IPO/LowerBitSets.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/IPO/LowerBitSets.h
+++ llvm/trunk/include/llvm/Transforms/IPO/LowerBitSets.h
@@ -48,6 +48,17 @@
     return Bits.size() == 1 && Bits[0] == 1;
   }
 
+  bool isAllOnes() const {
+    for (unsigned I = 0; I != Bits.size() - 1; ++I)
+      if (Bits[I] != 0xFF)
+        return false;
+
+    if (BitSize % 8 == 0)
+      return Bits[Bits.size() - 1] == 0xFF;
+
+    return Bits[Bits.size() - 1] == (1 << (BitSize % 8)) - 1;
+  }
+
   bool containsGlobalOffset(uint64_t Offset) const;
 
   bool containsValue(const DataLayout *DL,
Index: llvm/trunk/lib/Transforms/IPO/LowerBitSets.cpp
===================================================================
--- llvm/trunk/lib/Transforms/IPO/LowerBitSets.cpp
+++ llvm/trunk/lib/Transforms/IPO/LowerBitSets.cpp
@@ -157,6 +157,7 @@
   const DataLayout *DL;
 
   IntegerType *Int1Ty;
+  IntegerType *Int8Ty;
   IntegerType *Int32Ty;
   Type *Int32PtrTy;
   IntegerType *Int64Ty;
@@ -173,7 +174,7 @@
               const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout);
   Value *createBitSetTest(IRBuilder<> &B, const BitSetInfo &BSI,
                           GlobalVariable *BitSetGlobal, Value *BitOffset);
-  void
+  Value *
   lowerBitSetCall(CallInst *CI, const BitSetInfo &BSI,
                   GlobalVariable *BitSetGlobal, GlobalVariable *CombinedGlobal,
                   const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout);
@@ -203,6 +204,7 @@
     report_fatal_error("Data layout required");
 
   Int1Ty = Type::getInt1Ty(M.getContext());
+  Int8Ty = Type::getInt8Ty(M.getContext());
   Int32Ty = Type::getInt32Ty(M.getContext());
   Int32PtrTy = PointerType::getUnqual(Int32Ty);
   Int64Ty = Type::getInt64Ty(M.getContext());
@@ -293,19 +295,16 @@
   }
 }
 
-/// Lower a llvm.bitset.test call to its implementation.
-void LowerBitSets::lowerBitSetCall(
+/// Lower a llvm.bitset.test call to its implementation. Returns the value to
+/// replace the call with.
+Value *LowerBitSets::lowerBitSetCall(
     CallInst *CI, const BitSetInfo &BSI, GlobalVariable *BitSetGlobal,
     GlobalVariable *CombinedGlobal,
     const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) {
   Value *Ptr = CI->getArgOperand(0);
 
-  if (BSI.containsValue(DL, GlobalLayout, Ptr)) {
-    CI->replaceAllUsesWith(
-        ConstantInt::getTrue(BitSetGlobal->getParent()->getContext()));
-    CI->eraseFromParent();
-    return;
-  }
+  if (BSI.containsValue(DL, GlobalLayout, Ptr))
+    return ConstantInt::getTrue(BitSetGlobal->getParent()->getContext());
 
   Constant *GlobalAsInt = ConstantExpr::getPtrToInt(CombinedGlobal, IntPtrTy);
   Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd(
@@ -317,12 +316,8 @@
 
   Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy);
 
-  if (BSI.isSingleOffset()) {
-    Value *Eq = B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt);
-    CI->replaceAllUsesWith(Eq);
-    CI->eraseFromParent();
-    return;
-  }
+  if (BSI.isSingleOffset())
+    return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt);
 
   Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt);
 
@@ -349,6 +344,10 @@
   Constant *BitSizeConst = ConstantInt::get(IntPtrTy, BSI.BitSize);
   Value *OffsetInRange = B.CreateICmpULT(BitOffset, BitSizeConst);
 
+  // If the bit set is all ones, testing against it is unnecessary.
+  if (BSI.isAllOnes())
+    return OffsetInRange;
+
   TerminatorInst *Term = SplitBlockAndInsertIfThen(OffsetInRange, CI, false);
   IRBuilder<> ThenB(Term);
@@ -363,9 +362,7 @@
   PHINode *P = B.CreatePHI(Int1Ty, 2);
   P->addIncoming(ConstantInt::get(Int1Ty, 0), InitialBB);
   P->addIncoming(Bit, ThenB.GetInsertBlock());
-
-  CI->replaceAllUsesWith(P);
-  CI->eraseFromParent();
+  return P;
 }
 
 /// Given a disjoint set of bitsets and globals, layout the globals, build the
@@ -376,8 +373,24 @@
     const std::vector<GlobalVariable *> &Globals) {
   // Build a new global with the combined contents of the referenced globals.
   std::vector<Constant *> GlobalInits;
-  for (GlobalVariable *G : Globals)
+  for (GlobalVariable *G : Globals) {
     GlobalInits.push_back(G->getInitializer());
+    uint64_t InitSize = DL->getTypeAllocSize(G->getInitializer()->getType());
+
+    // Compute the amount of padding required to align the next element to the
+    // next power of 2.
+    uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize;
+
+    // Cap at 128 was found experimentally to have a good data/instruction
+    // overhead tradeoff.
+    if (Padding > 128)
+      Padding = RoundUpToAlignment(InitSize, 128) - InitSize;
+
+    GlobalInits.push_back(
+        ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding)));
+  }
+  if (!GlobalInits.empty())
+    GlobalInits.pop_back();
   Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits);
   auto CombinedGlobal =
       new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true,
@@ -389,7 +402,8 @@
   // Compute the offsets of the original globals within the new global.
   DenseMap<GlobalVariable *, uint64_t> GlobalLayout;
   for (unsigned I = 0; I != Globals.size(); ++I)
-    GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I);
+    // Multiply by 2 to account for padding elements.
+    GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2);
 
   // For each bitset in this disjoint set...
   for (MDString *BS : BitSets) {
@@ -406,7 +420,10 @@
 
     // Lower each call to llvm.bitset.test for this bitset.
     for (CallInst *CI : BitSetTestCallSites[BS]) {
       ++NumBitSetCallsLowered;
-      lowerBitSetCall(CI, BSI, BitSetGlobal, CombinedGlobal, GlobalLayout);
+      Value *Lowered =
+          lowerBitSetCall(CI, BSI, BitSetGlobal, CombinedGlobal, GlobalLayout);
+      CI->replaceAllUsesWith(Lowered);
+      CI->eraseFromParent();
     }
   }
@@ -414,8 +431,9 @@
   // global from which we built the combined global, and replace references
   // to the original globals with references to the aliases.
   for (unsigned I = 0; I != Globals.size(); ++I) {
+    // Multiply by 2 to account for padding elements.
     Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
-                                      ConstantInt::get(Int32Ty, I)};
+                                      ConstantInt::get(Int32Ty, I * 2)};
     Constant *CombinedGlobalElemPtr =
         ConstantExpr::getGetElementPtr(CombinedGlobal, CombinedGlobalIdxs);
     GlobalAlias *GAlias = GlobalAlias::create(
Index: llvm/trunk/test/Transforms/LowerBitSets/layout.ll
===================================================================
--- llvm/trunk/test/Transforms/LowerBitSets/layout.ll
+++ llvm/trunk/test/Transforms/LowerBitSets/layout.ll
@@ -6,7 +6,7 @@
 ; (see GlobalLayoutBuilder in include/llvm/Transforms/IPO/LowerBitSets.h).
 ; The chosen layout in this case is a, e, b, d, c.
 
-; CHECK: private constant { i32, i32, i32, i32, i32 } { i32 1, i32 5, i32 2, i32 4, i32 3 }
+; CHECK: private constant { i32, [0 x i8], i32, [0 x i8], i32, [0 x i8], i32, [0 x i8], i32 } { i32 1, [0 x i8] zeroinitializer, i32 5, [0 x i8] zeroinitializer, i32 2, [0 x i8] zeroinitializer, i32 4, [0 x i8] zeroinitializer, i32 3 }
 @a = constant i32 1
 @b = constant i32 2
 @c = constant i32 3
Index: llvm/trunk/test/Transforms/LowerBitSets/simple.ll
===================================================================
--- llvm/trunk/test/Transforms/LowerBitSets/simple.ll
+++ llvm/trunk/test/Transforms/LowerBitSets/simple.ll
@@ -3,14 +3,14 @@
 
 target datalayout = "e-p:32:32"
 
-; CHECK: [[G:@[^ ]*]] = private constant { i32, [63 x i32], i32, [2 x i32] } { i32 1, [63 x i32] zeroinitializer, i32 3, [2 x i32] [i32 4, i32 5] }
+; CHECK: [[G:@[^ ]*]] = private constant { i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] } { i32 1, [0 x i8] zeroinitializer, [63 x i32] zeroinitializer, [4 x i8] zeroinitializer, i32 3, [0 x i8] zeroinitializer, [2 x i32] [i32 4, i32 5] }
 @a = constant i32 1
 @b = constant [63 x i32] zeroinitializer
 @c = constant i32 3
 @d = constant [2 x i32] [i32 4, i32 5]
 
 ; Offset 0, 4 byte alignment
-; CHECK: @bitset1.bits = private constant [9 x i8] c"\03\00\00\00\00\00\00\00\04"
+; CHECK: @bitset1.bits = private constant [9 x i8] c"\03\00\00\00\00\00\00\00\08"
 !0 = !{!"bitset1", i32* @a, i32 0}
 ; CHECK-NODISCARD-DAG: !{!"bitset1", i32* @a, i32 0}
 !1 = !{!"bitset1", [63 x i32]* @b, i32 0}
@@ -18,15 +18,15 @@
 !2 = !{!"bitset1", [2 x i32]* @d, i32 4}
 ; CHECK-NODISCARD-DAG: !{!"bitset1", [2 x i32]* @d, i32 4}
 
-; Offset 4, 4 byte alignment
-; CHECK: @bitset2.bits = private constant [8 x i8] c"\01\00\00\00\00\00\00\80"
+; Offset 4, 256 byte alignment
+; CHECK: @bitset2.bits = private constant [1 x i8] c"\03"
 !3 = !{!"bitset2", [63 x i32]* @b, i32 0}
 ; CHECK-NODISCARD-DAG: !{!"bitset2", [63 x i32]* @b, i32 0}
 !4 = !{!"bitset2", i32* @c, i32 0}
 ; CHECK-NODISCARD-DAG: !{!"bitset2", i32* @c, i32 0}
 
-; Offset 0, 256 byte alignment
-; CHECK: @bitset3.bits = private constant [1 x i8] c"\03"
+; Offset 0, 4 byte alignment
+; CHECK: @bitset3.bits = private constant [9 x i8] c"\01\00\00\00\00\00\00\00\02"
 !5 = !{!"bitset3", i32* @a, i32 0}
 ; CHECK-NODISCARD-DAG: !{!"bitset3", i32* @a, i32 0}
 !6 = !{!"bitset3", i32* @c, i32 0}
@@ -38,10 +38,10 @@
 
 !llvm.bitsets = !{ !0, !1, !2, !3, !4, !5, !6, !7 }
 
-; CHECK: @a = alias getelementptr inbounds ({ i32, [63 x i32], i32, [2 x i32] }* [[G]], i32 0, i32 0)
-; CHECK: @b = alias getelementptr inbounds ({ i32, [63 x i32], i32, [2 x i32] }* [[G]], i32 0, i32 1)
-; CHECK: @c = alias getelementptr inbounds ({ i32, [63 x i32], i32, [2 x i32] }* [[G]], i32 0, i32 2)
-; CHECK: @d = alias getelementptr inbounds ({ i32, [63 x i32], i32, [2 x i32] }* [[G]], i32 0, i32 3)
+; CHECK: @a = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 0)
+; CHECK: @b = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 2)
+; CHECK: @c = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 4)
+; CHECK: @d = alias getelementptr inbounds ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]], i32 0, i32 6)
 
 declare i1 @llvm.bitset.test(i8* %ptr, metadata %bitset) nounwind readnone
 
@@ -52,11 +52,11 @@
   ; CHECK: [[R0:%[^ ]*]] = bitcast i32* [[A0]] to i8*
   %pi8 = bitcast i32* %p to i8*
   ; CHECK: [[R1:%[^ ]*]] = ptrtoint i8* [[R0]] to i32
-  ; CHECK: [[R2:%[^ ]*]] = sub i32 [[R1]], ptrtoint ({ i32, [63 x i32], i32, [2 x i32] }* [[G]] to i32)
+  ; CHECK: [[R2:%[^ ]*]] = sub i32 [[R1]], ptrtoint ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]] to i32)
   ; CHECK: [[R3:%[^ ]*]] = lshr i32 [[R2]], 2
   ; CHECK: [[R4:%[^ ]*]] = shl i32 [[R2]], 30
   ; CHECK: [[R5:%[^ ]*]] = or i32 [[R3]], [[R4]]
-  ; CHECK: [[R6:%[^ ]*]] = icmp ult i32 [[R5]], 67
+  ; CHECK: [[R6:%[^ ]*]] = icmp ult i32 [[R5]], 68
   ; CHECK: br i1 [[R6]]
 
   ; CHECK: [[R8:%[^ ]*]] = lshr i32 [[R5]], 5
@@ -82,22 +82,14 @@
   ; CHECK: [[S0:%[^ ]*]] = bitcast i32* [[B0]] to i8*
   %pi8 = bitcast i32* %p to i8*
   ; CHECK: [[S1:%[^ ]*]] = ptrtoint i8* [[S0]] to i32
-  ; CHECK: [[S2:%[^ ]*]] = sub i32 [[S1]], add (i32 ptrtoint ({ i32, [63 x i32], i32, [2 x i32] }* [[G]] to i32), i32 4)
-  ; CHECK: [[S3:%[^ ]*]] = lshr i32 [[S2]], 2
-  ; CHECK: [[S4:%[^ ]*]] = shl i32 [[S2]], 30
+  ; CHECK: [[S2:%[^ ]*]] = sub i32 [[S1]], add (i32 ptrtoint ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]] to i32), i32 4)
+  ; CHECK: [[S3:%[^ ]*]] = lshr i32 [[S2]], 8
+  ; CHECK: [[S4:%[^ ]*]] = shl i32 [[S2]], 24
   ; CHECK: [[S5:%[^ ]*]] = or i32 [[S3]], [[S4]]
-  ; CHECK: [[S6:%[^ ]*]] = icmp ult i32 [[S5]], 64
-  ; CHECK: br i1 [[S6]]
-
-  ; CHECK: [[S8:%[^ ]*]] = zext i32 [[S5]] to i64
-  ; CHECK: [[S9:%[^ ]*]] = and i64 [[S8]], 63
-  ; CHECK: [[S10:%[^ ]*]] = shl i64 1, [[S9]]
-  ; CHECK: [[S11:%[^ ]*]] = and i64 -9223372036854775807, [[S10]]
-  ; CHECK: [[S12:%[^ ]*]] = icmp ne i64 [[S11]], 0
-
-  ; CHECK: [[S16:%[^ ]*]] = phi i1 [ false, {{%[^ ]*}} ], [ [[S12]], {{%[^ ]*}} ]
+  ; CHECK: [[S6:%[^ ]*]] = icmp ult i32 [[S5]], 2
   %x = call i1 @llvm.bitset.test(i8* %pi8, metadata !"bitset2")
-  ; CHECK: ret i1 [[S16]]
+
+  ; CHECK: ret i1 [[S6]]
   ret i1 %x
 }
 
@@ -106,19 +98,22 @@
   ; CHECK: [[T0:%[^ ]*]] = bitcast i32* [[C0]] to i8*
   %pi8 = bitcast i32* %p to i8*
   ; CHECK: [[T1:%[^ ]*]] = ptrtoint i8* [[T0]] to i32
-  ; CHECK: [[T2:%[^ ]*]] = sub i32 [[T1]], ptrtoint ({ i32, [63 x i32], i32, [2 x i32] }* [[G]] to i32)
-  ; CHECK: [[T3:%[^ ]*]] = lshr i32 [[T2]], 8
-  ; CHECK: [[T4:%[^ ]*]] = shl i32 [[T2]], 24
+  ; CHECK: [[T2:%[^ ]*]] = sub i32 [[T1]], ptrtoint ({ i32, [0 x i8], [63 x i32], [4 x i8], i32, [0 x i8], [2 x i32] }* [[G]] to i32)
+  ; CHECK: [[T3:%[^ ]*]] = lshr i32 [[T2]], 2
+  ; CHECK: [[T4:%[^ ]*]] = shl i32 [[T2]], 30
   ; CHECK: [[T5:%[^ ]*]] = or i32 [[T3]], [[T4]]
-  ; CHECK: [[T6:%[^ ]*]] = icmp ult i32 [[T5]], 2
+  ; CHECK: [[T6:%[^ ]*]] = icmp ult i32 [[T5]], 66
   ; CHECK: br i1 [[T6]]
 
-  ; CHECK: [[T8:%[^ ]*]] = and i32 [[T5]], 31
-  ; CHECK: [[T9:%[^ ]*]] = shl i32 1, [[T8]]
-  ; CHECK: [[T10:%[^ ]*]] = and i32 3, [[T9]]
-  ; CHECK: [[T11:%[^ ]*]] = icmp ne i32 [[T10]], 0
+  ; CHECK: [[T8:%[^ ]*]] = lshr i32 [[T5]], 5
+  ; CHECK: [[T9:%[^ ]*]] = getelementptr i32* bitcast ([9 x i8]* @bitset3.bits to i32*), i32 [[T8]]
+  ; CHECK: [[T10:%[^ ]*]] = load i32* [[T9]]
+  ; CHECK: [[T11:%[^ ]*]] = and i32 [[T5]], 31
+  ; CHECK: [[T12:%[^ ]*]] = shl i32 1, [[T11]]
+  ; CHECK: [[T13:%[^ ]*]] = and i32 [[T10]], [[T12]]
+  ; CHECK: [[T14:%[^ ]*]] = icmp ne i32 [[T13]], 0
 
-  ; CHECK: [[T16:%[^ ]*]] = phi i1 [ false, {{%[^ ]*}} ], [ [[T11]], {{%[^ ]*}} ]
+  ; CHECK: [[T16:%[^ ]*]] = phi i1 [ false, {{%[^ ]*}} ], [ [[T14]], {{%[^ ]*}} ]
   %x = call i1 @llvm.bitset.test(i8* %pi8, metadata !"bitset3")
   ; CHECK: ret i1 [[T16]]
   ret i1 %x
Index: llvm/trunk/test/Transforms/LowerBitSets/single-offset.ll
===================================================================
--- llvm/trunk/test/Transforms/LowerBitSets/single-offset.ll
+++ llvm/trunk/test/Transforms/LowerBitSets/single-offset.ll
@@ -2,7 +2,7 @@
 
 target datalayout = "e-p:32:32"
 
-; CHECK: [[G:@[^ ]*]] = private constant { i32, i32 }
+; CHECK: [[G:@[^ ]*]] = private constant { i32, [0 x i8], i32 }
 @a = constant i32 1
 @b = constant i32 2
 
@@ -18,7 +18,7 @@
 ; CHECK: @foo(i8* [[A0:%[^ ]*]])
 define i1 @foo(i8* %p) {
   ; CHECK: [[R0:%[^ ]*]] = ptrtoint i8* [[A0]] to i32
-  ; CHECK: [[R1:%[^ ]*]] = icmp eq i32 [[R0]], ptrtoint ({ i32, i32 }* [[G]] to i32)
+  ; CHECK: [[R1:%[^ ]*]] = icmp eq i32 [[R0]], ptrtoint ({ i32, [0 x i8], i32 }* [[G]] to i32)
   %x = call i1 @llvm.bitset.test(i8* %p, metadata !"bitset2")
   ; CHECK: ret i1 [[R1]]
   ret i1 %x
@@ -27,7 +27,7 @@
 ; CHECK: @bar(i8* [[B0:%[^ ]*]])
 define i1 @bar(i8* %p) {
   ; CHECK: [[S0:%[^ ]*]] = ptrtoint i8* [[B0]] to i32
-  ; CHECK: [[S1:%[^ ]*]] = icmp eq i32 [[S0]], add (i32 ptrtoint ({ i32, i32 }* [[G]] to i32), i32 4)
+  ; CHECK: [[S1:%[^ ]*]] = icmp eq i32 [[S0]], add (i32 ptrtoint ({ i32, [0 x i8], i32 }* [[G]] to i32), i32 4)
   %x = call i1 @llvm.bitset.test(i8* %p, metadata !"bitset3")
   ; CHECK: ret i1 [[S1]]
   ret i1 %x
Index: llvm/trunk/unittests/Transforms/IPO/LowerBitSets.cpp
===================================================================
--- llvm/trunk/unittests/Transforms/IPO/LowerBitSets.cpp
+++ llvm/trunk/unittests/Transforms/IPO/LowerBitSets.cpp
@@ -20,19 +20,22 @@
     uint64_t BitSize;
     unsigned AlignLog2;
     bool IsSingleOffset;
+    bool IsAllOnes;
   } BSBTests[] = {
-      {{}, {0}, 0, 1, 0, false},
-      {{0}, {1}, 0, 1, 0, true},
-      {{4}, {1}, 4, 1, 0, true},
-      {{37}, {1}, 37, 1, 0, true},
-      {{0, 1}, {3}, 0, 2, 0, false},
-      {{0, 4}, {3}, 0, 2, 2, false},
-      {{0, uint64_t(1) << 33}, {3}, 0, 2, 33, false},
-      {{3, 7}, {3}, 3, 2, 2, false},
-      {{0, 1, 7}, {131}, 0, 8, 0, false},
-      {{0, 2, 14}, {131}, 0, 8, 1, false},
-      {{0, 1, 8}, {3, 1}, 0, 9, 0, false},
-      {{0, 2, 16}, {3, 1}, 0, 9, 1, false},
+      {{}, {0}, 0, 1, 0, false, false},
+      {{0}, {1}, 0, 1, 0, true, true},
+      {{4}, {1}, 4, 1, 0, true, true},
+      {{37}, {1}, 37, 1, 0, true, true},
+      {{0, 1}, {3}, 0, 2, 0, false, true},
+      {{0, 4}, {3}, 0, 2, 2, false, true},
+      {{0, uint64_t(1) << 33}, {3}, 0, 2, 33, false, true},
+      {{3, 7}, {3}, 3, 2, 2, false, true},
+      {{0, 1, 7}, {131}, 0, 8, 0, false, false},
+      {{0, 2, 14}, {131}, 0, 8, 1, false, false},
+      {{0, 1, 8}, {3, 1}, 0, 9, 0, false, false},
+      {{0, 2, 16}, {3, 1}, 0, 9, 1, false, false},
+      {{0, 1, 2, 3, 4, 5, 6, 7}, {255}, 0, 8, 0, false, true},
+      {{0, 1, 2, 3, 4, 5, 6, 7, 8}, {255, 1}, 0, 9, 0, false, true},
   };
 
   for (auto &&T : BSBTests) {
@@ -47,6 +50,7 @@
     EXPECT_EQ(T.BitSize, BSI.BitSize);
     EXPECT_EQ(T.AlignLog2, BSI.AlignLog2);
     EXPECT_EQ(T.IsSingleOffset, BSI.isSingleOffset());
+    EXPECT_EQ(T.IsAllOnes, BSI.isAllOnes());
 
     for (auto Offset : T.Offsets)
       EXPECT_TRUE(BSI.containsGlobalOffset(Offset));