Index: include/llvm/IR/PatternMatch.h
===================================================================
--- include/llvm/IR/PatternMatch.h
+++ include/llvm/IR/PatternMatch.h
@@ -317,6 +317,9 @@
 /// \brief Match a ConstantInt, capturing the value if we match.
 inline bind_ty<ConstantInt> m_ConstantInt(ConstantInt *&CI) { return CI; }
 
+/// \brief Match a load instruction, capturing the value if we match.
+inline bind_ty<LoadInst> m_Load(LoadInst *&LI) { return LI; }
+
 /// \brief Match a Constant, capturing the value if we match.
 inline bind_ty<Constant> m_Constant(Constant *&C) { return C; }
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -5709,6 +5709,342 @@
   return true;
 }
 
+/// Check whether memory will be modified between \p LI and \p SI.
+static bool memModBetween(LoadInst &LI, StoreInst &SI) {
+  // Check whether memory is modified inside a local iterator range of a
+  // BasicBlock.
+  auto memModInside = [](BasicBlock::const_iterator Start,
+                         BasicBlock::const_iterator End) -> bool {
+    for (const Instruction &Inst :
+         iterator_range<BasicBlock::const_iterator>(Start, End))
+      if (Inst.mayWriteToMemory())
+        return true;
+    return false;
+  };
+  const BasicBlock *LIBB = LI.getParent();
+  const BasicBlock *SIBB = SI.getParent();
+  BasicBlock::const_iterator LII(LI);
+  BasicBlock::const_iterator SII(SI);
+  if (LIBB == SIBB)
+    return memModInside(LII, SII);
+
+  // LIBB is different from SIBB. We need to scan from LI to the end of LIBB,
+  // from the beginning of SIBB to SI, and all BasicBlocks between LIBB and
+  // SIBB.
+  if (memModInside(LII, LIBB->end()))
+    return true;
+  if (memModInside(SIBB->begin(), SII))
+    return true;
+
+  // Collect the BasicBlocks to scan between LIBB and SIBB into BBSet.
+  // Limit the maximum number of BasicBlocks to 3 to protect compile time.
+  const int MaxBB = 3;
+  SmallPtrSet<const BasicBlock *, 4> BBSet;
+  SmallVector<const BasicBlock *, 4> WorkSet;
+  WorkSet.push_back(SIBB);
+  do {
+    const BasicBlock *BB = WorkSet.pop_back_val();
+    for (const BasicBlock *Pred : predecessors(BB)) {
+      if (Pred != LIBB && !BBSet.count(Pred)) {
+        BBSet.insert(Pred);
+        if (BBSet.size() > MaxBB)
+          return true;
+        WorkSet.push_back(Pred);
+      }
+    }
+  } while (!WorkSet.empty());
+
+  for (const BasicBlock *BB : BBSet) {
+    if (memModInside(BB->begin(), BB->end()))
+      return true;
+  }
+  return false;
+}
+
+/// Analyze or((and (load P), \p Cst), \p MaskedVal). Update \p ActualModBits
+/// with the number of bits of the original load to be modified, and update
+/// \p ShiftBits with the position of the first bit to be modified. If the
+/// analysis shows we can store MaskedVal, after some adjustment, without
+/// using the original load, keep \p StoreWithoutLoad true.
+static void analyzeOrAndPattern(Value &MaskedVal, ConstantInt &Cst,
+                                unsigned &ShiftBits, unsigned &ActualModBits,
+                                bool &StoreWithoutLoad, StoreInst &SI,
+                                const DataLayout &DL) {
+  // Cst is the mask. Analyze the bit pattern of the mask: we handle masks of
+  // the form 0..01..1 or 1..10..01..1.
+  APInt Mask = Cst.getValue();
+  unsigned MBitWidth = Mask.getBitWidth();
+  unsigned MaskLeadOnes = Mask.countLeadingOnes();
+  unsigned MaskTrailOnes = Mask.countTrailingOnes();
+  unsigned MaskMidZeros = !MaskLeadOnes
+                              ? Mask.countLeadingZeros()
+                              : Mask.ashr(MaskTrailOnes).countTrailingZeros();
+
+  StoreWithoutLoad = true;
+  // See if we have a continuous run of zeros.
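+  // For example, a 24-bit mask 0xFF00FF has MaskLeadOnes = 8, MaskMidZeros = 8
+  // and MaskTrailOnes = 8, which sum to the bit width, so the zeros form a
+  // single continuous run and the 8 masked-off bits are candidates for being
+  // stored without reading the original value.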
+  if (MaskMidZeros == 0 ||
+      MaskLeadOnes + MaskMidZeros + MaskTrailOnes != MBitWidth)
+    StoreWithoutLoad = false;
+
+  // Check that MaskedVal only provides nonzero bits within the range from
+  // lowbits (MaskTrailOnes) to highbits (MaskTrailOnes + MaskMidZeros).
+  APInt BitMask = ~APInt::getBitsSet(MBitWidth, MaskTrailOnes,
+                                     MaskTrailOnes + MaskMidZeros);
+
+  // Find out the range in which 1 appears in MaskedVal.
+  APInt KnownOne(MBitWidth, 0), KnownZero(MBitWidth, 0);
+  computeKnownBits(&MaskedVal, KnownZero, KnownOne, DL, 0);
+
+  ActualModBits = MaskMidZeros;
+  ShiftBits = MaskTrailOnes;
+  // Check !MaskedValueIsZero(MaskedVal, BitMask) by inlining the call,
+  // because we want to reuse the result of computeKnownBits to compute
+  // ShiftBits and ActualModBits.
+  if ((KnownZero & BitMask) != BitMask) {
+    StoreWithoutLoad = false;
+    unsigned Lower = KnownOne.countTrailingZeros();
+    unsigned Higher = MBitWidth - KnownOne.countLeadingZeros();
+    ShiftBits = std::min(Lower, MaskTrailOnes);
+    ActualModBits = std::max(Higher, MaskTrailOnes + MaskMidZeros) - ShiftBits;
+  }
+}
+
+/// Analyze \p Val = or/xor/and((load P), \p Cst). Update \p ActualModBits
+/// with the number of bits of the original load to be modified, and update
+/// \p ShiftBits with the position of the first bit to be modified.
+static void analyzeBOpPattern(Value &Val, ConstantInt &Cst, unsigned &ShiftBits,
+                              unsigned &ActualModBits) {
+  APInt Mask = Cst.getValue();
+  BinaryOperator *BOP = cast<BinaryOperator>(&Val);
+  if (BOP->getOpcode() == Instruction::And)
+    Mask = ~Mask;
+
+  ShiftBits = Mask.countTrailingZeros();
+  ActualModBits = Mask.getBitWidth() - ShiftBits;
+  if (ActualModBits)
+    ActualModBits = ActualModBits - Mask.countLeadingZeros();
+}
+
+/// Update \p ActualModBits and \p ShiftBits so that the updated
+/// \p ActualModBits bits can form a legal type and also cover all the
+/// modified bits.
+static void updateShiftAndModifiedBits(unsigned &ActualModBits,
+                                       unsigned &ShiftBits, unsigned TBits,
+                                       unsigned Align, LLVMContext &Context,
+                                       const DataLayout &DL,
+                                       const TargetLowering &TLI) {
+  unsigned NewModBits = PowerOf2Ceil(ActualModBits);
+  Type *NewTy = Type::getIntNTy(Context, NewModBits);
+  int NewShiftBits = 0;
+
+  // Check whether we can find a NewShiftBits for NewModBits, so that
+  // NewShiftBits and NewModBits form a new range covering the old modified
+  // range without worsening alignment.
+  auto coverOldRange = [&]() -> bool {
+    unsigned MAlign = MinAlign(Align, DL.getABITypeAlignment(NewTy));
+    NewShiftBits = ShiftBits - ShiftBits % (MAlign * 8);
+    while (NewShiftBits >= 0) {
+      if (NewModBits + NewShiftBits <= TBits &&
+          NewModBits + NewShiftBits >= ActualModBits + ShiftBits)
+        return true;
+      NewShiftBits -= MAlign * 8;
+    }
+    return false;
+  };
+  // See whether we can store NewTy legally.
+  auto isStoreLegalType = [&]() -> bool {
+    EVT OldEVT =
+        TLI.getValueType(DL, Type::getIntNTy(Context, PowerOf2Ceil(TBits)));
+    EVT NewEVT = TLI.getValueType(DL, NewTy);
+    return TLI.isOperationLegalOrCustom(ISD::STORE, NewEVT) ||
+           TLI.isTruncStoreLegalOrCustom(OldEVT, NewEVT);
+  };
+  // Try to find the minimal NewModBits which can form a legal type and cover
+  // all the old modified bits.
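+  // For example, with TBits = 56, ShiftBits = 4, ActualModBits = 9 and a
+  // byte-aligned store, NewModBits starts at 16 and NewShiftBits is rounded
+  // down to 0; the range [0, 16) covers the modified bits [4, 13), so an i16
+  // store at byte offset 0 is chosen if i16 stores are legal on the target.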
+  while (NewModBits < TBits && (!isStoreLegalType() || !coverOldRange())) {
+    NewModBits = NextPowerOf2(NewModBits);
+    NewTy = Type::getIntNTy(Context, NewModBits);
+  }
+  ActualModBits = NewModBits;
+  ShiftBits = NewShiftBits;
+}
+
+// If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
+// is a byte mask indicating a consecutive number of bytes, check to see if
+// Y is known to provide just those bytes. If so, what the original store
+// sequence is doing is "load P into V; replace some bytes of V with the
+// corresponding bytes of Y via bit manipulation; then store the updated V
+// back to P". We can replace the sequence with a single (narrower) store and
+// save the load and the bit operations (the first shrink transformation).
+//
+// If this is "store (or/and/xor (load P), cst), P", we know the maximum range
+// in which the load value will be modified, and we can shrink the store as
+// long as the shrunk size still covers the modified range (the second shrink
+// transformation). This is especially beneficial when the original size is
+// illegal and the size after shrinking is legal.
+//
+// For the first pattern, when the first shrink transformation fails, we can
+// still try the second one, because the first pattern also tells us the
+// range in which the load value will be modified.
+static bool reduceLoadOpsStoreWidth(StoreInst &SI, const DataLayout &DL,
+                                    const TargetLowering &TLI) {
+  Value *Val = SI.getOperand(0);
+  Value *Ptr = SI.getOperand(1);
+  Type *StoreTy = Val->getType();
+  if (StoreTy->isVectorTy() || !StoreTy->isIntegerTy() || !Val->hasOneUse())
+    return false;
+
+  unsigned TBits = DL.getTypeSizeInBits(StoreTy);
+  if (TBits != DL.getTypeStoreSizeInBits(StoreTy))
+    return false;
+
+  LoadInst *LI = nullptr;
+  Value *MaskedVal = nullptr;
+  ConstantInt *Cst = nullptr;
+  // Match "or((and (load P), cst), Y)" or "or/and/xor((load P), cst)" or
+  // their commuted forms.
+  bool OrAndPattern = false;
+  if (!(OrAndPattern = match(Val, m_c_Or(m_And(m_Load(LI), m_ConstantInt(Cst)),
+                                         m_Value(MaskedVal)))) &&
+      !match(Val, m_c_Or(m_Load(LI), m_ConstantInt(Cst))) &&
+      !match(Val, m_c_And(m_Load(LI), m_ConstantInt(Cst))) &&
+      !match(Val, m_c_Xor(m_Load(LI), m_ConstantInt(Cst))))
+    return false;
+
+  // LI should have the same address as SI.
+  if (LI->getOperand(0) != Ptr)
+    return false;
+
+  // Make sure the memory SI accesses is not modified between LI and SI.
+  if (memModBetween(*LI, SI))
+    return false;
+
+  // Ideally, we hope to optimize the load-and-or-store sequence into a single
+  // shrunk store of the shifted and truncated MaskedVal without accessing the
+  // original load, but if we cannot, we will try to optimize the sequence
+  // into a shrunk store of the truncated Val.
+  // For a load-bop-store sequence, StoreWithoutLoad will always be false.
+  bool StoreWithoutLoad = false;
+  // ActualModBits indicates the number of bits of the original load to be
+  // modified.
+  // ShiftBits indicates the position of the first bit to be modified.
+  // Both values are populated by analyzeOrAndPattern/analyzeBOpPattern below.
+  unsigned ActualModBits;
+  unsigned ShiftBits;
+  if (OrAndPattern)
+    analyzeOrAndPattern(*MaskedVal, *Cst, ShiftBits, ActualModBits,
+                        StoreWithoutLoad, SI, DL);
+  else
+    analyzeBOpPattern(*Val, *Cst, ShiftBits, ActualModBits);
+
+  // Clamp ActualModBits so the modified range does not go beyond the actual
+  // size of the store.
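+  // For example, with TBits = 16, ShiftBits = 8 and ActualModBits = 12, the
+  // range is clamped to the 8 bits starting at bit 8.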
+  if (ShiftBits > TBits) {
+    ShiftBits = 0;
+    ActualModBits = TBits;
+  } else if (ShiftBits + ActualModBits > TBits) {
+    ActualModBits = TBits - ShiftBits;
+  }
+
+  unsigned StOffset = 0;
+  if (StoreWithoutLoad) {
+    // Get the offset from Ptr for the shrunk store.
+    if (DL.isBigEndian())
+      StOffset = TBits - ShiftBits - ActualModBits;
+    else
+      StOffset = ShiftBits;
+    if (StOffset % 8 != 0)
+      StoreWithoutLoad = false;
+    else
+      StOffset = StOffset / 8;
+
+    // If ActualModBits is not the width of a legal type, we cannot store
+    // MaskedVal directly.
+    if (ActualModBits != 8 && ActualModBits != 16 && ActualModBits != 32)
+      StoreWithoutLoad = false;
+  }
+
+  unsigned Align = SI.getAlignment();
+  LLVMContext &Context = SI.getContext();
+  // If we are shrinking the store of Val, update ActualModBits and ShiftBits
+  // to ensure the shrunk store is of a legal type.
+  if (!StoreWithoutLoad) {
+    // If we cannot do the StoreWithoutLoad shrink, do the simple shrink only
+    // when StoreTy is illegal.
+    if (TLI.isOperationLegalOrCustom(ISD::STORE, TLI.getValueType(DL, StoreTy)))
+      return false;
+    if (!ActualModBits)
+      return false;
+    updateShiftAndModifiedBits(ActualModBits, ShiftBits, TBits, Align, Context,
+                               DL, TLI);
+    if (ActualModBits >= TBits)
+      return false;
+
+    if (DL.isBigEndian())
+      StOffset = (TBits - ShiftBits - ActualModBits) / 8;
+    else
+      StOffset = ShiftBits / 8;
+  }
+
+  // Start shrinking the size of the store.
+  Value *NewPtr = Ptr;
+  unsigned AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
+  IRBuilder<> Builder(Context);
+  Builder.SetInsertPoint(&SI);
+  if (StOffset) {
+    ConstantInt *Idx = ConstantInt::get(Type::getInt32Ty(Context), StOffset);
+    NewPtr =
+        Builder.CreateBitCast(Ptr, Type::getInt8PtrTy(Context, AS), "cast");
+    NewPtr =
+        Builder.CreateGEP(Type::getInt8Ty(Context), NewPtr, Idx, "uglygep");
+    Align = MinAlign(StOffset, Align);
+  }
+  Type *NewTy = Type::getIntNTy(Context, ActualModBits);
+  NewPtr = Builder.CreateBitCast(NewPtr, NewTy->getPointerTo(AS), "cast");
+  APInt ModifiedCst = Cst->getValue().lshr(ShiftBits).trunc(ActualModBits);
+  ConstantInt *NewCst = ConstantInt::get(Context, ModifiedCst);
+
+  Value *NewVal;
+  if (OrAndPattern) {
+    // Shift and truncate MaskedVal.
+    Value *Trunc;
+    if (auto *MVCst = dyn_cast<ConstantInt>(MaskedVal)) {
+      ModifiedCst = MVCst->getValue().lshr(ShiftBits).trunc(ActualModBits);
+      Trunc = ConstantInt::get(Context, ModifiedCst);
+    } else {
+      Value *ShiftedVal = ShiftBits
+                              ? Builder.CreateLShr(MaskedVal, ShiftBits, "lshr")
+                              : MaskedVal;
+      Trunc = Builder.CreateTruncOrBitCast(ShiftedVal, NewTy, "trunc");
+    }
+    // Create NewVal to store.
+    if (StoreWithoutLoad) {
+      NewVal = Trunc;
+    } else {
+      Value *NewLoad = Builder.CreateAlignedLoad(NewPtr, Align, "load.trunc");
+      Value *NewAnd = Builder.CreateAnd(NewLoad, NewCst, "and.trunc");
+      NewVal = Builder.CreateOr(NewAnd, Trunc, "or.trunc");
+    }
+  } else {
+    Value *NewLoad = Builder.CreateAlignedLoad(NewPtr, Align, "load.trunc");
+    // Create NewVal to store.
+    BinaryOperator *BOP = cast<BinaryOperator>(Val);
+    switch (BOP->getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected opcode");
+    case Instruction::And:
+      NewVal = Builder.CreateAnd(NewLoad, NewCst, "and.trunc");
+      break;
+    case Instruction::Or:
+      NewVal = Builder.CreateOr(NewLoad, NewCst, "or.trunc");
+      break;
+    case Instruction::Xor:
+      NewVal = Builder.CreateXor(NewLoad, NewCst, "xor.trunc");
+      break;
+    }
+  }
+  // Create the new store and remove the old one.
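+  // Note that only the original store is erased here; Val had a single use,
+  // so it becomes trivially dead and is left for later dead-code cleanup.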
+  Builder.CreateAlignedStore(NewVal, NewPtr, Align);
+  SI.eraseFromParent();
+  return true;
+}
+
 bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
@@ -5775,6 +6111,8 @@
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
     if (TLI && splitMergedValStore(*SI, *DL, *TLI))
       return true;
+    if (TLI && reduceLoadOpsStoreWidth(*SI, *DL, *TLI))
+      return true;
     SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     if (TLI) {
       unsigned AS = SI->getPointerAddressSpace();
Index: test/CodeGen/ARM/bitfield-store.ll
===================================================================
--- test/CodeGen/ARM/bitfield-store.ll
+++ test/CodeGen/ARM/bitfield-store.ll
@@ -0,0 +1,364 @@
+; RUN: opt < %s -mtriple=arm-eabi -codegenprepare -S | FileCheck %s
+; Check that bitfield stores are shrunk properly in the cases below.
+
+; class A1 {
+;   unsigned long f1:8;
+;   unsigned long f2:3;
+; } a1;
+; a1.f1 = n;
+;
+; The bitfield store can be shrunk from i16 to i8.
+; CHECK-LABEL: @test1(
+; CHECK: %conv = zext i32 %n to i64
+; CHECK: %t0 = trunc i64 %conv to i16
+; CHECK: %bf.value = and i16 %t0, 255
+; CHECK: %trunc = trunc i16 %bf.value to i8
+; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8
+
+%class.A1 = type { i16, [6 x i8] }
+@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8
+
+define void @test1(i32 %n) {
+entry:
+  %conv = zext i32 %n to i64
+  %t0 = trunc i64 %conv to i16
+  %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8
+  %bf.value = and i16 %t0, 255
+  %bf.clear = and i16 %bf.load, -256
+  %bf.set = or i16 %bf.clear, %bf.value
+  store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8
+  ret void
+}
+
+; class A2 {
+;   unsigned long f1:16;
+;   unsigned long f2:3;
+; } a2;
+; a2.f1 = n;
+; The bitfield store can be shrunk from i32 to i16.
+; CHECK-LABEL: @test2(
+; CHECK: %bf.value = and i32 %n, 65535
+; CHECK: %trunc = trunc i32 %bf.value to i16
+; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8
+
+%class.A2 = type { i24, [4 x i8] }
+@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8
+
+define void @test2(i32 %n) {
+entry:
+  %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8
+  %bf.value = and i32 %n, 65535
+  %bf.clear = and i32 %bf.load, -65536
+  %bf.set = or i32 %bf.clear, %bf.value
+  store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8
+  ret void
+}
+
+; class A3 {
+;   unsigned long f1:32;
+;   unsigned long f2:3;
+; } a3;
+; a3.f1 = n;
+; The bitfield store can be shrunk from i64 to i32.
+; CHECK-LABEL: @test3(
+; CHECK: %conv = zext i32 %n to i64
+; CHECK: %bf.value = and i64 %conv, 4294967295
+; CHECK: %trunc = trunc i64 %bf.value to i32
+; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8
+
+%class.A3 = type { i40 }
+@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8
+
+define void @test3(i32 %n) {
+entry:
+  %conv = zext i32 %n to i64
+  %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8
+  %bf.value = and i64 %conv, 4294967295
+  %bf.clear = and i64 %bf.load, -4294967296
+  %bf.set = or i64 %bf.clear, %bf.value
+  store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8
+  ret void
+}
+
+; class A4 {
+;   unsigned long f1:13;
+;   unsigned long f2:3;
+; } a4;
+; a4.f1 = n;
+; The bitfield store cannot be shrunk because the field is not 8/16/32 bits.
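+; Rounding the 13 modified bits up to a storable width gives back the full
+; 16-bit store, so there is nothing to save.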
+; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. +; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. 
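+; f2 occupies bits 16-31, a byte-aligned 16-bit chunk, so the new value is
+; shifted down, truncated and stored directly to the upper half at offset 2.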
+; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. +; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. 
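+; Only bit 13 of the i24 value changes; bit 13 lies in the middle byte, so an
+; i8 load/modify/store at byte offset 1 is enough.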
+; CHECK-LABEL: @i24_insert_bit(
+; CHECK: %extbit = zext i1 %bit to i24
+; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13
+; CHECK: %cast = bitcast i24* %a to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1
+; CHECK: %lshr = lshr i24 %extbit.shl, 8
+; CHECK: %trunc = trunc i24 %lshr to i8
+; CHECK: %load.trunc = load i8, i8* %uglygep, align 1
+; CHECK: %and.trunc = and i8 %load.trunc, -33
+; CHECK: %or.trunc = or i8 %and.trunc, %trunc
+; CHECK: store i8 %or.trunc, i8* %uglygep, align 1
+;
+define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
+  %extbit = zext i1 %bit to i24
+  %b = load i24, i24* %a, align 1
+  %extbit.shl = shl nuw nsw i24 %extbit, 13
+  %c = and i24 %b, -8193
+  %d = or i24 %c, %extbit.shl
+  store i24 %d, i24* %a, align 1
+  ret void
+}
+
+; Cannot remove the load and bit operations, but can still shrink the i56 store
+; to i32.
+; CHECK-LABEL: @i56_or(
+; CHECK: %cast = bitcast i56* %a to i32*
+; CHECK: %load.trunc = load i32, i32* %cast, align 1
+; CHECK: %or.trunc = or i32 %load.trunc, 384
+; CHECK: store i32 %or.trunc, i32* %cast, align 1
+;
+define void @i56_or(i56* %a) {
+  %aa = load i56, i56* %a, align 1
+  %b = or i56 %aa, 384
+  store i56 %b, i56* %a, align 1
+  ret void
+}
+
+; Cannot remove the load and bit operations, but can shrink the i56 store
+; to i32.
+; CHECK-LABEL: @i56_and_or(
+; CHECK: %cast = bitcast i56* %a to i32*
+; CHECK: %load.trunc = load i32, i32* %cast, align 1
+; CHECK: %and.trunc = and i32 %load.trunc, -128
+; CHECK: %or.trunc = or i32 %and.trunc, 384
+; CHECK: store i32 %or.trunc, i32* %cast, align 1
+;
+define void @i56_and_or(i56* %a) {
+  %b = load i56, i56* %a, align 1
+  %c = and i56 %b, -128
+  %d = or i56 %c, 384
+  store i56 %d, i56* %a, align 1
+  ret void
+}
+
+; Cannot remove the load and bit operations, but can shrink the i56 store to
+; i32.
+; CHECK-LABEL: @i56_insert_bit(
+; CHECK: %extbit = zext i1 %bit to i56
+; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13
+; CHECK: %cast = bitcast i56* %a to i8*
+; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1
+; CHECK: %cast1 = bitcast i8* %uglygep to i32*
+; CHECK: %lshr = lshr i56 %extbit.shl, 8
+; CHECK: %trunc = trunc i56 %lshr to i32
+; CHECK: %load.trunc = load i32, i32* %cast1, align 1
+; CHECK: %and.trunc = and i32 %load.trunc, -33
+; CHECK: %or.trunc = or i32 %and.trunc, %trunc
+; CHECK: store i32 %or.trunc, i32* %cast1, align 1
+;
+define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
+  %extbit = zext i1 %bit to i56
+  %b = load i56, i56* %a, align 1
+  %extbit.shl = shl nuw nsw i56 %extbit, 13
+  %c = and i56 %b, -8193
+  %d = or i56 %c, %extbit.shl
+  store i56 %d, i56* %a, align 1
+  ret void
+}
+
+; Cannot remove the load and bit operations, but can still shrink the i56 store
+; to i32.
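+; 17825792 is 0x1100000, so only bits 20 and 24 change; with the store only
+; 2-byte aligned, the shrunk 4-byte access is placed at byte offset 2 so that
+; the original alignment is preserved.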
+; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i32* +; CHECK: %load.trunc = load i32, i32* %cast1, align 2 +; CHECK: %or.trunc = or i32 %load.trunc, 272 +; CHECK: store i32 %or.trunc, i32* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: test/CodeGen/ARM/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -12,13 +12,9 @@ ; ; BE-LABEL: i24_or: ; BE: @ BB#0: -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: ldrb r2, [r0, #2] -; BE-NEXT: orr r1, r2, r1, lsl #8 +; BE-NEXT: ldrh r1, [r0, #1] ; BE-NEXT: orr r1, r1, #384 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] +; BE-NEXT: strh r1, [r0, #1] ; BE-NEXT: mov pc, lr %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 @@ -29,36 +25,23 @@ define void @i24_and_or(i24* %a) { ; LE-LABEL: i24_and_or: ; LE: @ BB#0: -; LE-NEXT: ldrb r1, [r0, #2] -; LE-NEXT: ldrh r2, [r0] -; LE-NEXT: orr r1, r2, r1, lsl #16 -; LE-NEXT: ldr r2, .LCPI1_0 +; LE-NEXT: ldrh r1, [r0] +; LE-NEXT: mov r2, #16256 +; LE-NEXT: orr r2, r2, #49152 ; LE-NEXT: orr r1, r1, #384 ; LE-NEXT: and r1, r1, r2 ; LE-NEXT: strh r1, [r0] -; LE-NEXT: lsr r1, r1, #16 -; LE-NEXT: strb r1, [r0, #2] ; LE-NEXT: mov pc, lr -; LE-NEXT: .p2align 2 -; LE-NEXT: @ BB#1: -; LE-NEXT: .LCPI1_0: -; LE-NEXT: .long 16777088 @ 0xffff80 ; ; BE-LABEL: i24_and_or: ; BE: @ BB#0: -; BE-NEXT: ldrh r1, [r0] -; BE-NEXT: mov r2, #384 -; BE-NEXT: orr r1, r2, r1, lsl #8 -; BE-NEXT: ldr r2, .LCPI1_0 +; BE-NEXT: ldrh r1, [r0, #1] +; BE-NEXT: mov r2, #16256 +; BE-NEXT: orr r2, r2, #49152 +; BE-NEXT: orr r1, r1, #384 ; BE-NEXT: and r1, r1, r2 -; BE-NEXT: strb r1, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] -; BE-NEXT: mov pc, lr -; BE-NEXT: .p2align 2 -; BE-NEXT: @ BB#1: -; BE-NEXT: .LCPI1_0: -; BE-NEXT: .long 16777088 @ 0xffff80 +; BE-NEXT: strh r1, [r0, #1] +; BE-NEXT: mov pc, lr %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 %d = or i24 %c, 384 @@ -69,37 +52,19 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; LE-LABEL: i24_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldrb r2, [r0, #2] -; LE-NEXT: ldrh r3, [r0] -; LE-NEXT: orr r2, r3, r2, lsl #16 -; LE-NEXT: ldr r3, .LCPI2_0 -; LE-NEXT: and r2, r2, r3 -; LE-NEXT: lsr r3, r2, #16 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: strb r3, [r0, #2] -; LE-NEXT: strh r1, [r0] +; LE-NEXT: ldrb r2, [r0, #1] +; LE-NEXT: and r2, r2, #223 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: strb r1, [r0, #1] ; LE-NEXT: mov pc, lr -; LE-NEXT: .p2align 2 -; LE-NEXT: @ BB#1: -; LE-NEXT: .LCPI2_0: -; LE-NEXT: .long 16769023 @ 0xffdfff ; ; BE-LABEL: i24_insert_bit: ; BE: @ BB#0: -; BE-NEXT: ldrh r2, [r0] -; BE-NEXT: ldrb r3, [r0, #2] -; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: ldr r3, .LCPI2_0 -; BE-NEXT: and r2, r2, r3 -; BE-NEXT: orr r1, r2, r1, lsl #13 -; BE-NEXT: strb r2, [r0, #2] -; BE-NEXT: lsr r1, r1, #8 -; BE-NEXT: strh r1, [r0] -; BE-NEXT: mov pc, lr -; BE-NEXT: .p2align 2 -; BE-NEXT: @ BB#1: -; BE-NEXT: .LCPI2_0: -; BE-NEXT: .long 16769023 @ 0xffdfff +; BE-NEXT: ldrb r2, [r0, #1] +; BE-NEXT: and r2, r2, #223 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: strb r1, [r0, #1] +; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i24 %b = load 
i24, i24* %a, align 1 %extbit.shl = shl nuw nsw i24 %extbit, 13 @@ -119,19 +84,9 @@ ; ; BE-LABEL: i56_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: ldrh r2, [r1, #4]! -; BE-NEXT: ldrb r3, [r1, #2] -; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strb r2, [r1, #2] -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %aa = load i56, i56* %a %b = or i56 %aa, 384 @@ -150,19 +105,10 @@ ; ; BE-LABEL: i56_and_or: ; BE: @ BB#0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: mov r3, #128 -; BE-NEXT: ldrh r2, [r1, #4]! -; BE-NEXT: strb r3, [r1, #2] -; BE-NEXT: lsl r2, r2, #8 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 -; BE-NEXT: str r1, [r0] +; BE-NEXT: ldr r1, [r0, #3] +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: bic r1, r1, #127 +; BE-NEXT: str r1, [r0, #3] ; BE-NEXT: mov pc, lr %b = load i56, i56* %a, align 1 @@ -175,31 +121,18 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { ; LE-LABEL: i56_insert_bit: ; LE: @ BB#0: -; LE-NEXT: ldr r2, [r0] -; LE-NEXT: bic r2, r2, #8192 -; LE-NEXT: orr r1, r2, r1, lsl #13 -; LE-NEXT: str r1, [r0] +; LE-NEXT: ldr r2, [r0, #1] +; LE-NEXT: bic r2, r2, #32 +; LE-NEXT: orr r1, r2, r1, lsl #5 +; LE-NEXT: str r1, [r0, #1] ; LE-NEXT: mov pc, lr ; ; BE-LABEL: i56_insert_bit: ; BE: @ BB#0: -; BE-NEXT: .save {r11, lr} -; BE-NEXT: push {r11, lr} -; BE-NEXT: mov r2, r0 -; BE-NEXT: ldr lr, [r0] -; BE-NEXT: ldrh r12, [r2, #4]! -; BE-NEXT: ldrb r3, [r2, #2] -; BE-NEXT: orr r12, r3, r12, lsl #8 -; BE-NEXT: orr r3, r12, lr, lsl #24 -; BE-NEXT: bic r3, r3, #8192 -; BE-NEXT: orr r1, r3, r1, lsl #13 -; BE-NEXT: strb r3, [r2, #2] -; BE-NEXT: lsr r3, r1, #8 -; BE-NEXT: strh r3, [r2] -; BE-NEXT: bic r2, lr, #255 -; BE-NEXT: orr r1, r2, r1, lsr #24 -; BE-NEXT: str r1, [r0] -; BE-NEXT: pop {r11, lr} +; BE-NEXT: ldr r2, [r0, #2] +; BE-NEXT: bic r2, r2, #32 +; BE-NEXT: orr r1, r2, r1, lsl #5 +; BE-NEXT: str r1, [r0, #2] ; BE-NEXT: mov pc, lr %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 Index: test/CodeGen/X86/bitfield-store.ll =================================================================== --- test/CodeGen/X86/bitfield-store.ll +++ test/CodeGen/X86/bitfield-store.ll @@ -0,0 +1,363 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -codegenprepare -S | FileCheck %s +; Check bitfield store is shrinked properly in cases below. + +; class A1 { +; unsigned long f1:8; +; unsigned long f2:3; +; } a1; +; a1.f1 = n; +; +; The bitfield store can be shrinked from i16 to i8. 
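+; f1 is the low byte of the i16, so storing the truncated value as an i8 at
+; offset 0 suffices.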
+; CHECK-LABEL: @test1( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %t0 = trunc i64 %conv to i16 +; CHECK: %bf.value = and i16 %t0, 255 +; CHECK: %trunc = trunc i16 %bf.value to i8 +; CHECK: store i8 %trunc, i8* bitcast (%class.A1* @a1 to i8*), align 8 + +%class.A1 = type { i16, [6 x i8] } +@a1 = local_unnamed_addr global %class.A1 zeroinitializer, align 8 + +define void @test1(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %t0 = trunc i64 %conv to i16 + %bf.load = load i16, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + %bf.value = and i16 %t0, 255 + %bf.clear = and i16 %bf.load, -256 + %bf.set = or i16 %bf.clear, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A1, %class.A1* @a1, i32 0, i32 0), align 8 + ret void +} + +; class A2 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a2; +; a2.f1 = n; +; The bitfield store can be shrinked from i32 to i16. +; CHECK-LABEL: @test2( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A2* @a2 to i16*), align 8 + +%class.A2 = type { i24, [4 x i8] } +@a2 = local_unnamed_addr global %class.A2 zeroinitializer, align 8 + +define void @test2(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A2* @a2 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.clear = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear, %bf.value + store i32 %bf.set, i32* bitcast (%class.A2* @a2 to i32*), align 8 + ret void +} + +; class A3 { +; unsigned long f1:32; +; unsigned long f2:3; +; } a3; +; a3.f1 = n; +; The bitfield store can be shrinked from i64 to i32. +; CHECK-LABEL: @test3( +; CHECK: %conv = zext i32 %n to i64 +; CHECK: %bf.value = and i64 %conv, 4294967295 +; CHECK: %trunc = trunc i64 %bf.value to i32 +; CHECK: store i32 %trunc, i32* bitcast (%class.A3* @a3 to i32*), align 8 + +%class.A3 = type { i40 } +@a3 = local_unnamed_addr global %class.A3 zeroinitializer, align 8 + +define void @test3(i32 %n) { +entry: + %conv = zext i32 %n to i64 + %bf.load = load i64, i64* bitcast (%class.A3* @a3 to i64*), align 8 + %bf.value = and i64 %conv, 4294967295 + %bf.clear = and i64 %bf.load, -4294967296 + %bf.set = or i64 %bf.clear, %bf.value + store i64 %bf.set, i64* bitcast (%class.A3* @a3 to i64*), align 8 + ret void +} + +; class A4 { +; unsigned long f1:13; +; unsigned long f2:3; +; } a4; +; a4.f1 = n; +; The bitfield store cannot be shrinked because the field is not 8/16/32 bits. 
+; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: %t0 = trunc i32 %n to i16 +; CHECK-NEXT: %bf.value = and i16 %t0, 8191 +; CHECK-NEXT: %bf.clear3 = and i16 %bf.load, -8192 +; CHECK-NEXT: %bf.set = or i16 %bf.clear3, %bf.value +; CHECK-NEXT: store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 +; CHECK-NEXT: ret void + +%class.A4 = type { i16, [6 x i8] } +@a4 = local_unnamed_addr global %class.A4 zeroinitializer, align 8 + +define void @test4(i32 %n) { +entry: + %bf.load = load i16, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + %t0 = trunc i32 %n to i16 + %bf.value = and i16 %t0, 8191 + %bf.clear3 = and i16 %bf.load, -8192 + %bf.set = or i16 %bf.clear3, %bf.value + store i16 %bf.set, i16* getelementptr inbounds (%class.A4, %class.A4* @a4, i64 0, i32 0), align 8 + ret void +} + +; class A5 { +; unsigned long f1:3; +; unsigned long f2:16; +; } a5; +; a5.f2 = n; +; The bitfield store cannot be shrinked because it is not aligned on +; 16bits boundary. +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: %bf.value = and i32 %n, 65535 +; CHECK-NEXT: %bf.shl = shl i32 %bf.value, 3 +; CHECK-NEXT: %bf.clear = and i32 %bf.load, -524281 +; CHECK-NEXT: %bf.set = or i32 %bf.clear, %bf.shl +; CHECK-NEXT: store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 +; CHECK-NEXT: ret void + +%class.A5 = type { i24, [4 x i8] } +@a5 = local_unnamed_addr global %class.A5 zeroinitializer, align 8 + +define void @test5(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A5* @a5 to i32*), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 3 + %bf.clear = and i32 %bf.load, -524281 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* bitcast (%class.A5* @a5 to i32*), align 8 + ret void +} + +; class A6 { +; unsigned long f1:16; +; unsigned long f2:3; +; } a6; +; a6.f1 = n; +; The bitfield store can be shrinked from i32 to i16 even the load and store +; are in different BasicBlocks. +; CHECK-LABEL: @test6( +; CHECK: if.end: +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %trunc = trunc i32 %bf.value to i16 +; CHECK: store i16 %trunc, i16* bitcast (%class.A6* @a6 to i16*), align 8 + +%class.A6 = type { i24, [4 x i8] } +@a6 = local_unnamed_addr global %class.A6 zeroinitializer, align 8 + +define void @test6(i32 %n) { +entry: + %bf.load = load i32, i32* bitcast (%class.A6* @a6 to i32*), align 8 + %bf.clear = and i32 %bf.load, 65535 + %cmp = icmp eq i32 %bf.clear, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %bf.value = and i32 %n, 65535 + %bf.clear3 = and i32 %bf.load, -65536 + %bf.set = or i32 %bf.clear3, %bf.value + store i32 %bf.set, i32* bitcast (%class.A6* @a6 to i32*), align 8 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; class A7 { +; unsigned long f1:16; +; unsigned long f2:16; +; } a7; +; a7.f2 = n; +; The bitfield store can be shrinked from i32 to i16. 
+; CHECK-LABEL: @test7( +; CHECK: %bf.value = and i32 %n, 65535 +; CHECK: %bf.shl = shl i32 %bf.value, 16 +; CHECK: %lshr = lshr i32 %bf.shl, 16 +; CHECK: %trunc = trunc i32 %lshr to i16 +; CHECK: store i16 %trunc, i16* bitcast (i8* getelementptr (i8, i8* bitcast (%class.A7* @a7 to i8*), i32 2) to i16*), align 2 + +%class.A7 = type { i32, [4 x i8] } +@a7 = local_unnamed_addr global %class.A7 zeroinitializer, align 8 + +define void @test7(i32 %n) { +entry: + %bf.load = load i32, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + %bf.value = and i32 %n, 65535 + %bf.shl = shl i32 %bf.value, 16 + %bf.clear = and i32 %bf.load, 65535 + %bf.set = or i32 %bf.clear, %bf.shl + store i32 %bf.set, i32* getelementptr inbounds (%class.A7, %class.A7* @a7, i32 0, i32 0), align 8 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_or(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = or i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i8. +; CHECK-LABEL: @i24_and( +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -7 +; CHECK: store i8 %and.trunc, i8* %uglygep, align 1 +; +define void @i24_and(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = and i24 %aa, -1537 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_xor( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %xor.trunc = xor i16 %load.trunc, 384 +; CHECK: store i16 %xor.trunc, i16* %cast, align 1 +; +define void @i24_xor(i24* %a) { + %aa = load i24, i24* %a, align 1 + %b = xor i24 %aa, 384 + store i24 %b, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i24 store +; to i16. +; CHECK-LABEL: @i24_and_or( +; CHECK: %cast = bitcast i24* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i24_and_or(i24* %a) { + %b = load i24, i24* %a, align 1 + %c = and i24 %b, -128 + %d = or i24 %c, 384 + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i24 store to i8. 
+; CHECK-LABEL: @i24_insert_bit( +; CHECK: %extbit = zext i1 %bit to i24 +; CHECK: %extbit.shl = shl nuw nsw i24 %extbit, 13 +; CHECK: %cast = bitcast i24* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %lshr = lshr i24 %extbit.shl, 8 +; CHECK: %trunc = trunc i24 %lshr to i8 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i24 + %b = load i24, i24* %a, align 1 + %extbit.shl = shl nuw nsw i24 %extbit, 13 + %c = and i24 %b, -8193 + %d = or i24 %c, %extbit.shl + store i24 %d, i24* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %or.trunc = or i16 %load.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_or(i56* %a) { + %aa = load i56, i56* %a, align 1 + %b = or i56 %aa, 384 + store i56 %b, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store +; to i16. +; CHECK-LABEL: @i56_and_or( +; CHECK: %cast = bitcast i56* %a to i16* +; CHECK: %load.trunc = load i16, i16* %cast, align 1 +; CHECK: %and.trunc = and i16 %load.trunc, -128 +; CHECK: %or.trunc = or i16 %and.trunc, 384 +; CHECK: store i16 %or.trunc, i16* %cast, align 1 +; +define void @i56_and_or(i56* %a) { + %b = load i56, i56* %a, align 1 + %c = and i56 %b, -128 + %d = or i56 %c, 384 + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can shrink the i56 store to i8. +; CHECK-LABEL: @i56_insert_bit( +; CHECK: %extbit = zext i1 %bit to i56 +; CHECK: %extbit.shl = shl nuw nsw i56 %extbit, 13 +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 1 +; CHECK: %lshr = lshr i56 %extbit.shl, 8 +; CHECK: %trunc = trunc i56 %lshr to i8 +; CHECK: %load.trunc = load i8, i8* %uglygep, align 1 +; CHECK: %and.trunc = and i8 %load.trunc, -33 +; CHECK: %or.trunc = or i8 %and.trunc, %trunc +; CHECK: store i8 %or.trunc, i8* %uglygep, align 1 +; +define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { + %extbit = zext i1 %bit to i56 + %b = load i56, i56* %a, align 1 + %extbit.shl = shl nuw nsw i56 %extbit, 13 + %c = and i56 %b, -8193 + %d = or i56 %c, %extbit.shl + store i56 %d, i56* %a, align 1 + ret void +} + +; Cannot remove the load and bit operations, but can still shrink the i56 store +; to i16. 
+; CHECK-LABEL: @i56_or_alg2( +; CHECK: %cast = bitcast i56* %a to i8* +; CHECK: %uglygep = getelementptr i8, i8* %cast, i32 2 +; CHECK: %cast1 = bitcast i8* %uglygep to i16* +; CHECK: %load.trunc = load i16, i16* %cast1, align 2 +; CHECK: %or.trunc = or i16 %load.trunc, 272 +; CHECK: store i16 %or.trunc, i16* %cast1, align 2 +; +define void @i56_or_alg2(i56* %a) { + %aa = load i56, i56* %a, align 2 + %b = or i56 %aa, 17825792 + store i56 %b, i56* %a, align 2 + ret void +} + + Index: test/CodeGen/X86/illegal-bitfield-loadstore.ll =================================================================== --- test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -4,13 +4,7 @@ define void @i24_or(i24* %a) { ; CHECK-LABEL: i24_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: movw %cx, (%rdi) +; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180 ; CHECK-NEXT: retq %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 @@ -22,14 +16,9 @@ ; CHECK-LABEL: i24_and_or: ; CHECK: # BB#0: ; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 -; CHECK-NEXT: movw %cx, (%rdi) -; CHECK-NEXT: shrl $16, %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) +; CHECK-NEXT: orl $384, %eax # imm = 0x180 +; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80 +; CHECK-NEXT: movw %ax, (%rdi) ; CHECK-NEXT: retq %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 @@ -41,17 +30,11 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { ; CHECK-LABEL: i24_insert_bit: ; CHECK: # BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: movzwl (%rdi), %ecx -; CHECK-NEXT: movzbl 2(%rdi), %edx -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: shll $13, %eax -; CHECK-NEXT: andl $16769023, %edx # imm = 0xFFDFFF -; CHECK-NEXT: orl %edx, %eax -; CHECK-NEXT: shrl $16, %edx -; CHECK-NEXT: movb %dl, 2(%rdi) -; CHECK-NEXT: movw %ax, (%rdi) +; CHECK-NEXT: shlb $5, %sil +; CHECK-NEXT: movb 1(%rdi), %al +; CHECK-NEXT: andb $-33, %al +; CHECK-NEXT: orb %sil, %al +; CHECK-NEXT: movb %al, 1(%rdi) ; CHECK-NEXT: retq %extbit = zext i1 %bit to i24 %b = load i24, i24* %a, align 1 @@ -65,19 +48,7 @@ define void @i56_or(i56* %a) { ; CHECK-LABEL: i56_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) -; CHECK-NEXT: # kill: %ECX %ECX %RCX %RCX -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: orq $384, %rdx # imm = 0x180 -; CHECK-NEXT: movl %edx, (%rdi) -; CHECK-NEXT: shrq $32, %rdx -; CHECK-NEXT: movw %dx, 4(%rdi) +; CHECK-NEXT: orw $384, (%rdi) # imm = 0x180 ; CHECK-NEXT: retq %aa = load i56, i56* %a, align 1 %b = or i56 %aa, 384 @@ -88,22 +59,10 @@ define void @i56_and_or(i56* %a) { ; CHECK-LABEL: i56_and_or: ; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: movl (%rdi), %eax -; CHECK-NEXT: orq %rcx, %rax -; CHECK-NEXT: orq $384, %rax # imm = 0x180 -; CHECK-NEXT: movabsq $72057594037927808, %rcx # imm = 
0xFFFFFFFFFFFF80
-; CHECK-NEXT: andq %rax, %rcx
-; CHECK-NEXT: movl %ecx, (%rdi)
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movw %ax, 4(%rdi)
-; CHECK-NEXT: shrq $48, %rcx
-; CHECK-NEXT: movb %cl, 6(%rdi)
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: orl $384, %eax # imm = 0x180
+; CHECK-NEXT: andl $65408, %eax # imm = 0xFF80
+; CHECK-NEXT: movw %ax, (%rdi)
 ; CHECK-NEXT: retq
 %b = load i56, i56* %a, align 1
 %c = and i56 %b, -128
@@ -115,23 +74,11 @@
 define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
 ; CHECK-LABEL: i56_insert_bit:
 ; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: movzwl 4(%rdi), %ecx
-; CHECK-NEXT: movzbl 6(%rdi), %edx
-; CHECK-NEXT: shll $16, %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: shlq $32, %rdx
-; CHECK-NEXT: movl (%rdi), %ecx
-; CHECK-NEXT: orq %rdx, %rcx
-; CHECK-NEXT: shlq $13, %rax
-; CHECK-NEXT: movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF
-; CHECK-NEXT: andq %rcx, %rdx
-; CHECK-NEXT: orq %rdx, %rax
-; CHECK-NEXT: movl %eax, (%rdi)
-; CHECK-NEXT: shrq $48, %rdx
-; CHECK-NEXT: movb %dl, 6(%rdi)
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movw %ax, 4(%rdi)
+; CHECK-NEXT: shlb $5, %sil
+; CHECK-NEXT: movb 1(%rdi), %al
+; CHECK-NEXT: andb $-33, %al
+; CHECK-NEXT: orb %sil, %al
+; CHECK-NEXT: movb %al, 1(%rdi)
+; CHECK-NEXT: retq
 %extbit = zext i1 %bit to i56
 %b = load i56, i56* %a, align 1