diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3735,6 +3735,14 @@
       const MemOp &Op, unsigned DstAS, unsigned SrcAS,
       const AttributeList &FuncAttributes) const;
 
+  /// Determine the optimal base offset so that the remaining offsets fit into
+  /// the addressing mode after CodeGenPrepare::splitLargeGEPOffsets. Returns
+  /// (BaseOffset, End), meaning BaseOffset should be used as the base offset
+  /// for the GEPs in the range [0, End).
+  virtual std::pair<int64_t, size_t> selectBaseOffsetForPrefix(
+      ArrayRef<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
+          LargeOffsetGEPs) const;
+
   /// Check to see if the specified operand of the specified instruction is a
   /// constant integer. If so, check to see if there are any bits set in the
   /// constant that are not demanded. If so, shrink the constant and return
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -6095,91 +6095,91 @@
     // Skip if all the GEPs have the same offsets.
     if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)
       continue;
-    GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;
-    int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
-    Value *NewBaseGEP = nullptr;
-
-    auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
-    while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
-      GetElementPtrInst *GEP = LargeOffsetGEP->first;
-      int64_t Offset = LargeOffsetGEP->second;
-      if (Offset != BaseOffset) {
-        TargetLowering::AddrMode AddrMode;
-        AddrMode.HasBaseReg = true;
-        AddrMode.BaseOffs = Offset - BaseOffset;
-        // The result type of the GEP might not be the type of the memory
-        // access.
-        if (!TLI->isLegalAddressingMode(*DL, AddrMode,
-                                        GEP->getResultElementType(),
-                                        GEP->getAddressSpace())) {
-          // We need to create a new base if the offset to the current base is
-          // too large to fit into the addressing mode. So, a very large struct
-          // may be split into several parts.
-          BaseGEP = GEP;
-          BaseOffset = Offset;
-          NewBaseGEP = nullptr;
+    while (!LargeOffsetGEPs.empty()) {
+      auto [BaseOffset, End] = TLI->selectBaseOffsetForPrefix(LargeOffsetGEPs);
+      Value *NewBaseGEP = nullptr;
+
+      auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
+      for (size_t Idx = 0; Idx < End; ++Idx) {
+        GetElementPtrInst *GEP = LargeOffsetGEP->first;
+        int64_t Offset = LargeOffsetGEP->second;
+        if (Offset != BaseOffset) {
+          TargetLowering::AddrMode AddrMode;
+          AddrMode.HasBaseReg = true;
+          AddrMode.BaseOffs = Offset - BaseOffset;
+          // The result type of the GEP might not be the type of the memory
+          // access.
+          if (!TLI->isLegalAddressingMode(*DL, AddrMode,
+                                          GEP->getResultElementType(),
+                                          GEP->getAddressSpace())) {
+            // We need to create a new base if the offset to the current base is
+            // too large to fit into the addressing mode. So, a very large
+            // struct may be split into several parts.
+            BaseOffset = Offset;
+            NewBaseGEP = nullptr;
+          }
         }
-      }
-      // Generate a new GEP to replace the current one.
-      LLVMContext &Ctx = GEP->getContext();
-      Type *PtrIdxTy = DL->getIndexType(GEP->getType());
-      Type *I8PtrTy =
-          Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
-      Type *I8Ty = Type::getInt8Ty(Ctx);
-
-      if (!NewBaseGEP) {
-        // Create a new base if we don't have one yet. Find the insertion
-        // pointer for the new base first.
-        BasicBlock::iterator NewBaseInsertPt;
-        BasicBlock *NewBaseInsertBB;
-        if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
-          // If the base of the struct is an instruction, the new base will be
-          // inserted close to it.
-          NewBaseInsertBB = BaseI->getParent();
-          if (isa<PHINode>(BaseI))
+        // Generate a new GEP to replace the current one.
+        LLVMContext &Ctx = GEP->getContext();
+        Type *PtrIdxTy = DL->getIndexType(GEP->getType());
+        Type *I8PtrTy =
+            Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
+        Type *I8Ty = Type::getInt8Ty(Ctx);
+
+        if (!NewBaseGEP) {
+          // Create a new base if we don't have one yet. Find the insertion
+          // pointer for the new base first.
+          BasicBlock::iterator NewBaseInsertPt;
+          BasicBlock *NewBaseInsertBB;
+          if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
+            // If the base of the struct is an instruction, the new base will be
+            // inserted close to it.
+            NewBaseInsertBB = BaseI->getParent();
+            if (isa<PHINode>(BaseI))
+              NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+            else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
+              NewBaseInsertBB = SplitEdge(
+                  NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI);
+              NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+            } else
+              NewBaseInsertPt = std::next(BaseI->getIterator());
+          } else {
+            // If the current base is an argument or global value, the new base
+            // will be inserted to the entry block.
+            NewBaseInsertBB = &GEP->getFunction()->getEntryBlock();
             NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
-          else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
-            NewBaseInsertBB =
-                SplitEdge(NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI);
-            NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
-          } else
-            NewBaseInsertPt = std::next(BaseI->getIterator());
-        } else {
-          // If the current base is an argument or global value, the new base
-          // will be inserted to the entry block.
-          NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
-          NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+          }
+          IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
+          // Create a new base.
+          Value *BaseIndex = ConstantInt::get(PtrIdxTy, BaseOffset);
+          NewBaseGEP = OldBase;
+          if (NewBaseGEP->getType() != I8PtrTy)
+            NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
+          NewBaseGEP =
+              NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
+          NewGEPBases.insert(NewBaseGEP);
         }
-        IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
-        // Create a new base.
-        Value *BaseIndex = ConstantInt::get(PtrIdxTy, BaseOffset);
-        NewBaseGEP = OldBase;
-        if (NewBaseGEP->getType() != I8PtrTy)
-          NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
-        NewBaseGEP =
-            NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
-        NewGEPBases.insert(NewBaseGEP);
-      }
-      IRBuilder<> Builder(GEP);
-      Value *NewGEP = NewBaseGEP;
-      if (Offset == BaseOffset) {
-        if (GEP->getType() != I8PtrTy)
-          NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
-      } else {
-        // Calculate the new offset for the new GEP.
-        Value *Index = ConstantInt::get(PtrIdxTy, Offset - BaseOffset);
-        NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
+        IRBuilder<> Builder(GEP);
+        Value *NewGEP = NewBaseGEP;
+        if (Offset == BaseOffset) {
+          if (GEP->getType() != I8PtrTy)
+            NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+        } else {
+          // Calculate the new offset for the new GEP.
+          Value *Index = ConstantInt::get(PtrIdxTy, Offset - BaseOffset);
+          NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
 
-        if (GEP->getType() != I8PtrTy)
-          NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+          if (GEP->getType() != I8PtrTy)
+            NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+        }
+        replaceAllUsesWith(GEP, NewGEP, FreshBBs, IsHugeFunc);
+        LargeOffsetGEPID.erase(GEP);
+        LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
+        GEP->eraseFromParent();
+        Changed = true;
       }
-      replaceAllUsesWith(GEP, NewGEP, FreshBBs, IsHugeFunc);
-      LargeOffsetGEPID.erase(GEP);
-      LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
-      GEP->eraseFromParent();
-      Changed = true;
     }
   }
   return Changed;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -284,6 +284,14 @@
   return true;
 }
 
+// By default, use the smallest offset as the base offset for all of the GEPs.
+std::pair<int64_t, size_t> TargetLowering::selectBaseOffsetForPrefix(
+    ArrayRef<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
+        LargeOffsetGEPs) const {
+  return std::make_pair(LargeOffsetGEPs.begin()->second,
+                        LargeOffsetGEPs.size());
+}
+
 /// Soften the operands of a comparison. This code is shared among BR_CC,
 /// SELECT_CC, and SETCC handlers.
 void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -765,6 +765,10 @@
                                    MachineBasicBlock::instr_iterator &MBBI,
                                    const TargetInstrInfo *TII) const override;
 
+  std::pair<int64_t, size_t> selectBaseOffsetForPrefix(
+      ArrayRef<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
+          LargeOffsetGEPs) const override;
+
   /// RISCVCCAssignFn - This target-specific function extends the default
   /// CCValAssign with additional information used to lower RISC-V calling
   /// conventions.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16959,6 +16959,103 @@
   return getTargetMMOFlags(NodeX) == getTargetMMOFlags(NodeY);
 }
 
+static Type *inferAccessTypeOfGEP(GetElementPtrInst *GEP) {
+  for (auto *User : GEP->users()) {
+    if (auto *UserInst = dyn_cast<Instruction>(User))
+      if (Type *AccessType = UserInst->getAccessType())
+        return AccessType;
+  }
+  // fallback
+  return GEP->getResultElementType();
+}
+
+// Select the optimal base offset to make the offsets of the GEPs fit into
+// simm12.
+std::pair<int64_t, size_t> RISCVTargetLowering::selectBaseOffsetForPrefix(
+    ArrayRef<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
+        LargeOffsetGEPs) const {
+  int64_t Alignment = 1;
+  int64_t Range = 4096; // simm12
+  // Infer the type of the memory ops from the users of the first GEP (falling
+  // back to its result element type) so that we can use compressed load/store
+  // instructions.
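+  // The compressed forms c.lw/c.sw (and c.ld/c.sd on RV64) only encode a
+  // zero-extended 5-bit offset scaled by the access size, so keeping all of
+  // the offsets inside a small, suitably aligned window keeps them
+  // compressible.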
+  if (Subtarget.hasStdExtCOrZca()) {
+    Type *EleTy = inferAccessTypeOfGEP(LargeOffsetGEPs.begin()->first);
+    if (Subtarget.hasStdExtCOrZca() && EleTy->isIntegerTy(32)) {
+      Alignment = 4;
+      Range = 128; // simm5 scaled by 4
+    }
+    if (Subtarget.hasStdExtCOrZca() && EleTy->isIntegerTy(64)) {
+      Alignment = 8;
+      Range = 256; // simm5 scaled by 8
+    }
+    if (Subtarget.hasStdExtCOrZca() && EleTy->isPointerTy()) {
+      Alignment = Subtarget.getXLen() / 8;
+      Range = 32 * Alignment; // simm5 scaled by XLEN in bytes
+    }
+    if ((Subtarget.hasStdExtC() || Subtarget.hasStdExtZcf()) &&
+        EleTy->isFloatTy()) {
+      Alignment = 4;
+      Range = 128; // simm5 scaled by 4
+    }
+    if ((Subtarget.hasStdExtC() || Subtarget.hasStdExtZcd()) &&
+        EleTy->isDoubleTy()) {
+      Alignment = 8;
+      Range = 256; // simm5 scaled by 8
+    }
+  }
+  int64_t RangeMin = -Range / 2, RangeMax = Range / 2 - 1;
+
+  auto Iter = LargeOffsetGEPs.begin();
+  int64_t Begin = Iter->second;
+  int64_t End = Iter->second;
+  // BaseOffsetMax + 1 - BaseOffsetMin >= Alignment ==>
+  // (Begin - RangeMin) + 1 - (End - RangeMax) >= Alignment ==>
+  // End <= Begin + Range - Alignment
+  while (Iter != LargeOffsetGEPs.end() &&
+         Iter->second <= Begin + Range - Alignment) {
+    End = Iter->second;
+    ++Iter;
+  }
+  // Select the optimal base offset which satisfies
+  // `BaseOffset + RangeMin <= Begin <= End <= BaseOffset + RangeMax`.
+  int64_t BaseOffsetMin = End - RangeMax, BaseOffsetMax = Begin - RangeMin;
+  // Make sure that at least one offset which satisfies the alignment
+  // requirement can be selected.
+  assert(BaseOffsetMax + 1 - BaseOffsetMin >= Alignment);
+  size_t Count = static_cast<size_t>(Iter - LargeOffsetGEPs.begin());
+  while ((BaseOffsetMin - Begin) % Alignment != 0)
+    ++BaseOffsetMin;
+  while ((BaseOffsetMax - Begin) % Alignment != 0)
+    --BaseOffsetMax;
+  // BaseOffset = BaseOffsetMin + k * Alignment
+  // Choose the optimal offset in the following order:
+  // 1. In the range [-2048, 2048): a single addi can obtain the address.
+  // 2. Divisible by 4096: lui + add can obtain the address.
+  // 3. Otherwise, choose BaseOffsetMin or BaseOffsetMax, whichever is closer
+  //    to zero.
+  int64_t BaseOffset;
+  if (BaseOffsetMin <= 0 && BaseOffsetMax >= 0) {
+    BaseOffset = BaseOffsetMax % Alignment;
+  } else {
+    BaseOffset = std::abs(BaseOffsetMin) < std::abs(BaseOffsetMax)
+                     ? BaseOffsetMin
+                     : BaseOffsetMax;
+  }
+  if (BaseOffset < -2048 || BaseOffset > 2047) {
+    // Prefer a base offset that is a multiple of 4096, since lui (+ add) can
+    // materialize it without a trailing addi. A multiple of 4096 satisfies
+    // the alignment congruence with Begin only when Begin itself is
+    // Alignment-aligned.
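+    // For example, byte offsets 4194305..4194308 can all use the base
+    // 4194304 = 1024 * 4096, which a single lui materializes.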
+ if (Begin % Alignment == 0) { + int64_t Offset = BaseOffsetMax / 4096 * 4096; + while (Offset > BaseOffsetMin) { + if (BaseOffsetMin <= Offset && Offset <= BaseOffsetMax) { + BaseOffset = Offset; + break; + } + Offset -= 4096; + } + } + } + + return std::make_pair(BaseOffset, Count); +} + namespace llvm::RISCVVIntrinsicsTable { #define GET_RISCVVIntrinsicsTable_IMPL diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll @@ -506,11 +506,11 @@ ; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma ; CHECK-NEXT: .LBB9_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: addi a4, a0, 32 -; CHECK-NEXT: addi a5, a1, -128 -; CHECK-NEXT: vlse32.v v8, (a5), a3 +; CHECK-NEXT: addi a4, a1, -128 +; CHECK-NEXT: vlse32.v v8, (a4), a3 ; CHECK-NEXT: vlse32.v v9, (a1), a3 ; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: addi a4, a0, 32 ; CHECK-NEXT: vle32.v v11, (a4) ; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vadd.vv v9, v11, v9 @@ -655,11 +655,11 @@ ; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; V-NEXT: .LBB11_1: # %bb2 ; V-NEXT: # =>This Inner Loop Header: Depth=1 -; V-NEXT: addi a4, a1, 80 ; V-NEXT: vlse64.v v8, (a1), a3 +; V-NEXT: addi a4, a1, 80 ; V-NEXT: vlse64.v v9, (a4), a3 -; V-NEXT: addi a4, a0, 16 ; V-NEXT: vse64.v v8, (a0) +; V-NEXT: addi a4, a0, 16 ; V-NEXT: vse64.v v9, (a4) ; V-NEXT: addi a2, a2, -4 ; V-NEXT: addi a0, a0, 32 @@ -732,8 +732,8 @@ ; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; V-NEXT: .LBB12_1: # %bb2 ; V-NEXT: # =>This Inner Loop Header: Depth=1 -; V-NEXT: addi a4, a1, 16 ; V-NEXT: vle64.v v8, (a1) +; V-NEXT: addi a4, a1, 16 ; V-NEXT: vle64.v v9, (a4) ; V-NEXT: addi a4, a0, 80 ; V-NEXT: vsse64.v v8, (a0), a3 diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll --- a/llvm/test/CodeGen/RISCV/split-offsets.ll +++ b/llvm/test/CodeGen/RISCV/split-offsets.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32,RV32I ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefix=RV64I +; RUN: | FileCheck %s -check-prefixes=CHECK,RV64,RV64I +; RUN: llc -mtriple=riscv32 -mattr=+c -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32,RV32C +; RUN: llc -mtriple=riscv64 -mattr=+c -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=CHECK,RV64,RV64C ; Check that memory accesses to array elements with large offsets have those ; offsets split into a base offset, plus a smaller offset that is folded into @@ -13,31 +17,59 @@ ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: lw a0, 0(a0) ; RV32I-NEXT: lui a2, 20 -; RV32I-NEXT: addi a2, a2, -1920 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: li a2, 2 -; RV32I-NEXT: sw a2, 0(a0) +; RV32I-NEXT: sw a2, -1920(a0) ; RV32I-NEXT: li a3, 1 -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a3, 0(a1) -; RV32I-NEXT: sw a2, 4(a1) +; RV32I-NEXT: sw a3, -1916(a0) +; RV32I-NEXT: sw a3, -1920(a1) +; RV32I-NEXT: sw a2, -1916(a1) ; RV32I-NEXT: ret ; ; RV64I-LABEL: test1: ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: ld a0, 0(a0) ; RV64I-NEXT: lui a2, 20 -; RV64I-NEXT: addiw a2, a2, -1920 ; 
RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: li a2, 2 -; RV64I-NEXT: sw a2, 0(a0) +; RV64I-NEXT: sw a2, -1920(a0) ; RV64I-NEXT: li a3, 1 -; RV64I-NEXT: sw a3, 4(a0) -; RV64I-NEXT: sw a3, 0(a1) -; RV64I-NEXT: sw a2, 4(a1) +; RV64I-NEXT: sw a3, -1916(a0) +; RV64I-NEXT: sw a3, -1920(a1) +; RV64I-NEXT: sw a2, -1916(a1) ; RV64I-NEXT: ret +; +; RV32C-LABEL: test1: +; RV32C: # %bb.0: # %entry +; RV32C-NEXT: lw a0, 0(a0) +; RV32C-NEXT: lui a2, 20 +; RV32C-NEXT: addi a2, a2, -1976 +; RV32C-NEXT: add a1, a1, a2 +; RV32C-NEXT: add a0, a0, a2 +; RV32C-NEXT: li a2, 2 +; RV32C-NEXT: sw a2, 56(a0) +; RV32C-NEXT: li a3, 1 +; RV32C-NEXT: sw a3, 60(a0) +; RV32C-NEXT: sw a3, 56(a1) +; RV32C-NEXT: sw a2, 60(a1) +; RV32C-NEXT: ret +; +; RV64C-LABEL: test1: +; RV64C: # %bb.0: # %entry +; RV64C-NEXT: ld a0, 0(a0) +; RV64C-NEXT: lui a2, 20 +; RV64C-NEXT: addiw a2, a2, -1976 +; RV64C-NEXT: add a1, a1, a2 +; RV64C-NEXT: add a0, a0, a2 +; RV64C-NEXT: li a2, 2 +; RV64C-NEXT: sw a2, 56(a0) +; RV64C-NEXT: li a3, 1 +; RV64C-NEXT: sw a3, 60(a0) +; RV64C-NEXT: sw a3, 56(a1) +; RV64C-NEXT: sw a2, 60(a1) +; RV64C-NEXT: ret entry: %s = load ptr, ptr %sp %gep0 = getelementptr [65536 x i32], ptr %s, i64 0, i32 20000 @@ -55,31 +87,29 @@ define void @test2(ptr %sp, ptr %t, i32 %n) { ; RV32I-LABEL: test2: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: li a3, 0 -; RV32I-NEXT: lw a0, 0(a0) +; RV32I-NEXT: lw a3, 0(a0) +; RV32I-NEXT: li a0, 0 ; RV32I-NEXT: lui a4, 20 -; RV32I-NEXT: addi a4, a4, -1920 ; RV32I-NEXT: add a1, a1, a4 -; RV32I-NEXT: add a0, a0, a4 +; RV32I-NEXT: add a3, a3, a4 ; RV32I-NEXT: blez a2, .LBB1_2 ; RV32I-NEXT: .LBB1_1: # %while_body ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: addi a4, a3, 1 -; RV32I-NEXT: sw a4, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a4, 0(a1) -; RV32I-NEXT: sw a3, 4(a1) -; RV32I-NEXT: mv a3, a4 +; RV32I-NEXT: addi a4, a0, 1 +; RV32I-NEXT: sw a4, -1920(a3) +; RV32I-NEXT: sw a0, -1916(a3) +; RV32I-NEXT: sw a4, -1920(a1) +; RV32I-NEXT: sw a0, -1916(a1) +; RV32I-NEXT: mv a0, a4 ; RV32I-NEXT: blt a4, a2, .LBB1_1 ; RV32I-NEXT: .LBB1_2: # %while_end ; RV32I-NEXT: ret ; ; RV64I-LABEL: test2: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: ld a0, 0(a0) +; RV64I-NEXT: li a3, 0 ; RV64I-NEXT: lui a4, 20 -; RV64I-NEXT: addiw a4, a4, -1920 ; RV64I-NEXT: add a1, a1, a4 ; RV64I-NEXT: add a0, a0, a4 ; RV64I-NEXT: sext.w a2, a2 @@ -87,14 +117,57 @@ ; RV64I-NEXT: .LBB1_1: # %while_body ; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64I-NEXT: addiw a4, a3, 1 -; RV64I-NEXT: sw a4, 0(a0) -; RV64I-NEXT: sw a3, 4(a0) -; RV64I-NEXT: sw a4, 0(a1) -; RV64I-NEXT: sw a3, 4(a1) +; RV64I-NEXT: sw a4, -1920(a0) +; RV64I-NEXT: sw a3, -1916(a0) +; RV64I-NEXT: sw a4, -1920(a1) +; RV64I-NEXT: sw a3, -1916(a1) ; RV64I-NEXT: mv a3, a4 ; RV64I-NEXT: blt a4, a2, .LBB1_1 ; RV64I-NEXT: .LBB1_2: # %while_end ; RV64I-NEXT: ret +; +; RV32C-LABEL: test2: +; RV32C: # %bb.0: # %entry +; RV32C-NEXT: li a3, 0 +; RV32C-NEXT: lw a0, 0(a0) +; RV32C-NEXT: lui a4, 20 +; RV32C-NEXT: addi a4, a4, -1976 +; RV32C-NEXT: add a1, a1, a4 +; RV32C-NEXT: add a0, a0, a4 +; RV32C-NEXT: blez a2, .LBB1_2 +; RV32C-NEXT: .LBB1_1: # %while_body +; RV32C-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32C-NEXT: addi a4, a3, 1 +; RV32C-NEXT: sw a4, 56(a0) +; RV32C-NEXT: sw a3, 60(a0) +; RV32C-NEXT: sw a4, 56(a1) +; RV32C-NEXT: sw a3, 60(a1) +; RV32C-NEXT: mv a3, a4 +; RV32C-NEXT: blt a4, a2, .LBB1_1 +; RV32C-NEXT: .LBB1_2: # %while_end +; RV32C-NEXT: ret +; +; RV64C-LABEL: test2: +; RV64C: # 
%bb.0: # %entry +; RV64C-NEXT: li a3, 0 +; RV64C-NEXT: ld a0, 0(a0) +; RV64C-NEXT: lui a4, 20 +; RV64C-NEXT: addiw a4, a4, -1976 +; RV64C-NEXT: add a1, a1, a4 +; RV64C-NEXT: add a0, a0, a4 +; RV64C-NEXT: sext.w a2, a2 +; RV64C-NEXT: blez a2, .LBB1_2 +; RV64C-NEXT: .LBB1_1: # %while_body +; RV64C-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64C-NEXT: addiw a4, a3, 1 +; RV64C-NEXT: sw a4, 56(a0) +; RV64C-NEXT: sw a3, 60(a0) +; RV64C-NEXT: sw a4, 56(a1) +; RV64C-NEXT: sw a3, 60(a1) +; RV64C-NEXT: mv a3, a4 +; RV64C-NEXT: blt a4, a2, .LBB1_1 +; RV64C-NEXT: .LBB1_2: # %while_end +; RV64C-NEXT: ret entry: %s = load ptr, ptr %sp br label %while_cond @@ -122,27 +195,27 @@ ; instructions. Make sure we use an offset and common base for each of the ; stores. define void @test3(ptr %t) { -; RV32I-LABEL: test3: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lui a1, 20 -; RV32I-NEXT: addi a1, a1, -1920 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: li a1, 2 -; RV32I-NEXT: sw a1, 4(a0) -; RV32I-NEXT: li a1, 3 -; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: ret +; RV32-LABEL: test3: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lui a1, 20 +; RV32-NEXT: addi a1, a1, -1920 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: li a1, 2 +; RV32-NEXT: sw a1, 4(a0) +; RV32-NEXT: li a1, 3 +; RV32-NEXT: sw a1, 8(a0) +; RV32-NEXT: ret ; -; RV64I-LABEL: test3: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: lui a1, 20 -; RV64I-NEXT: addiw a1, a1, -1920 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: li a1, 2 -; RV64I-NEXT: sw a1, 4(a0) -; RV64I-NEXT: li a1, 3 -; RV64I-NEXT: sw a1, 8(a0) -; RV64I-NEXT: ret +; RV64-LABEL: test3: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lui a1, 20 +; RV64-NEXT: addiw a1, a1, -1920 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 2 +; RV64-NEXT: sw a1, 4(a0) +; RV64-NEXT: li a1, 3 +; RV64-NEXT: sw a1, 8(a0) +; RV64-NEXT: ret entry: %splitgep = getelementptr i8, ptr %t, i64 80000 %0 = getelementptr i8, ptr %splitgep, i64 4 @@ -154,34 +227,151 @@ ; Test from PR62734. define void @test4(ptr %dest) { -; RV32I-LABEL: test4: +; CHECK-LABEL: test4: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, 4 +; CHECK-NEXT: li a1, 1 +; CHECK-NEXT: sb a1, 2044(a0) +; CHECK-NEXT: sb a1, 2045(a0) +; CHECK-NEXT: sb a1, 2046(a0) +; CHECK-NEXT: sb a1, 2047(a0) +; CHECK-NEXT: ret + %p1 = getelementptr i8, ptr %dest, i32 2048 + store i8 1, ptr %p1 + %p2 = getelementptr i8, ptr %dest, i32 2049 + store i8 1, ptr %p2 + %p3 = getelementptr i8, ptr %dest, i32 2050 + store i8 1, ptr %p3 + %p4 = getelementptr i8, ptr %dest, i32 2051 + store i8 1, ptr %p4 + ret void +} + +; Select aligned offset when we can benefit from compressed load/store instructions. 
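+; With C/Zca, c.sw only encodes offsets that are multiples of 4 in [0, 124],
+; so the base below is chosen to place the four stores at offsets 48..60.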
+define void @test5(ptr %dest) { +; RV32I-LABEL: test5: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, a0, 1965 +; RV32I-NEXT: li a1, 1 +; RV32I-NEXT: sw a1, 2035(a0) +; RV32I-NEXT: sw a1, 2039(a0) +; RV32I-NEXT: sw a1, 2043(a0) +; RV32I-NEXT: sw a1, 2047(a0) +; RV32I-NEXT: ret +; +; RV64I-LABEL: test5: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, a0, 1965 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: sw a1, 2035(a0) +; RV64I-NEXT: sw a1, 2039(a0) +; RV64I-NEXT: sw a1, 2043(a0) +; RV64I-NEXT: sw a1, 2047(a0) +; RV64I-NEXT: ret +; +; RV32C-LABEL: test5: +; RV32C: # %bb.0: +; RV32C-NEXT: addi a0, a0, 2047 +; RV32C-NEXT: addi a0, a0, 1905 +; RV32C-NEXT: li a1, 1 +; RV32C-NEXT: sw a1, 48(a0) +; RV32C-NEXT: sw a1, 52(a0) +; RV32C-NEXT: sw a1, 56(a0) +; RV32C-NEXT: sw a1, 60(a0) +; RV32C-NEXT: ret +; +; RV64C-LABEL: test5: +; RV64C: # %bb.0: +; RV64C-NEXT: addi a0, a0, 2047 +; RV64C-NEXT: addi a0, a0, 1905 +; RV64C-NEXT: li a1, 1 +; RV64C-NEXT: sw a1, 48(a0) +; RV64C-NEXT: sw a1, 52(a0) +; RV64C-NEXT: sw a1, 56(a0) +; RV64C-NEXT: sw a1, 60(a0) +; RV64C-NEXT: ret + %p1 = getelementptr i32, ptr %dest, i32 1000 + store i32 1, ptr %p1 + %p2 = getelementptr i32, ptr %dest, i32 1001 + store i32 1, ptr %p2 + %p3 = getelementptr i32, ptr %dest, i32 1002 + store i32 1, ptr %p3 + %p4 = getelementptr i32, ptr %dest, i32 1003 + store i32 1, ptr %p4 + ret void +} + +; FIXME: We can reuse a1 to emit compressed load/store instructions. +define void @test6(ptr %dest) { +; RV32I-LABEL: test6: ; RV32I: # %bb.0: -; RV32I-NEXT: addi a0, a0, 2047 -; RV32I-NEXT: addi a1, a0, 1 +; RV32I-NEXT: addi a1, a0, 5 ; RV32I-NEXT: li a2, 1 -; RV32I-NEXT: sb a2, 1(a0) -; RV32I-NEXT: sb a2, 1(a1) -; RV32I-NEXT: sb a2, 2(a1) -; RV32I-NEXT: sb a2, 3(a1) +; RV32I-NEXT: sw a2, 2040(a0) +; RV32I-NEXT: sw a2, 2044(a0) +; RV32I-NEXT: sw a2, 2043(a1) +; RV32I-NEXT: sw a2, 2047(a1) ; RV32I-NEXT: ret ; -; RV64I-LABEL: test4: +; RV64I-LABEL: test6: ; RV64I: # %bb.0: -; RV64I-NEXT: addi a0, a0, 2047 -; RV64I-NEXT: addi a1, a0, 1 +; RV64I-NEXT: addi a1, a0, 5 ; RV64I-NEXT: li a2, 1 -; RV64I-NEXT: sb a2, 1(a0) -; RV64I-NEXT: sb a2, 1(a1) -; RV64I-NEXT: sb a2, 2(a1) -; RV64I-NEXT: sb a2, 3(a1) +; RV64I-NEXT: sw a2, 2040(a0) +; RV64I-NEXT: sw a2, 2044(a0) +; RV64I-NEXT: sw a2, 2043(a1) +; RV64I-NEXT: sw a2, 2047(a1) ; RV64I-NEXT: ret - %p1 = getelementptr i8, ptr %dest, i32 2048 +; +; RV32C-LABEL: test6: +; RV32C: # %bb.0: +; RV32C-NEXT: addi a1, a0, 1992 +; RV32C-NEXT: li a2, 1 +; RV32C-NEXT: sw a2, 2040(a0) +; RV32C-NEXT: sw a2, 2044(a0) +; RV32C-NEXT: sw a2, 56(a1) +; RV32C-NEXT: sw a2, 60(a1) +; RV32C-NEXT: ret +; +; RV64C-LABEL: test6: +; RV64C: # %bb.0: +; RV64C-NEXT: addi a1, a0, 1992 +; RV64C-NEXT: li a2, 1 +; RV64C-NEXT: sw a2, 2040(a0) +; RV64C-NEXT: sw a2, 2044(a0) +; RV64C-NEXT: sw a2, 56(a1) +; RV64C-NEXT: sw a2, 60(a1) +; RV64C-NEXT: ret + %p1 = getelementptr i32, ptr %dest, i32 510 + store i32 1, ptr %p1 + %p2 = getelementptr i32, ptr %dest, i32 511 + store i32 1, ptr %p2 + %p3 = getelementptr i32, ptr %dest, i32 512 + store i32 1, ptr %p3 + %p4 = getelementptr i32, ptr %dest, i32 513 + store i32 1, ptr %p4 + ret void +} + +; Select offset divisible by 4096 to use lui + add to obtain the address. 
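+; The byte offsets 4194305..4194308 share the base 4194304 = 1024 * 4096,
+; which is materialized by lui a1, 1024 with no trailing addi.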
+define void @test7(ptr %dest) { +; CHECK-LABEL: test7: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 1024 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: li a1, 1 +; CHECK-NEXT: sb a1, 1(a0) +; CHECK-NEXT: sb a1, 2(a0) +; CHECK-NEXT: sb a1, 3(a0) +; CHECK-NEXT: sb a1, 4(a0) +; CHECK-NEXT: ret + %p1 = getelementptr i8, ptr %dest, i32 4194305 store i8 1, ptr %p1 - %p2 = getelementptr i8, ptr %dest, i32 2049 + %p2 = getelementptr i8, ptr %dest, i32 4194306 store i8 1, ptr %p2 - %p3 = getelementptr i8, ptr %dest, i32 2050 + %p3 = getelementptr i8, ptr %dest, i32 4194307 store i8 1, ptr %p3 - %p4 = getelementptr i8, ptr %dest, i32 2051 + %p4 = getelementptr i8, ptr %dest, i32 4194308 store i8 1, ptr %p4 ret void }
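For illustration, a hand-written sketch (value names and the i64 index type are illustrative; riscv64 with +c assumed) of the IR that splitLargeGEPOffsets is expected to produce for test5 above: the hook sees the byte offsets 4000..4012, clamps the base offset to [4012 - 63, 4000 + 64] = [3949, 4064], rounds that to the 4-byte congruence class of 4000 to get [3952, 4064], and picks 3952 (the addi a0, a0, 2047 / addi a0, a0, 1905 pair in the checks), leaving every store at a c.sw-encodable offset:

  define void @test5(ptr %dest) {
    %splitgep = getelementptr i8, ptr %dest, i64 3952
    %p1 = getelementptr i8, ptr %splitgep, i64 48
    store i32 1, ptr %p1
    %p2 = getelementptr i8, ptr %splitgep, i64 52
    store i32 1, ptr %p2
    %p3 = getelementptr i8, ptr %splitgep, i64 56
    store i32 1, ptr %p3
    %p4 = getelementptr i8, ptr %splitgep, i64 60
    store i32 1, ptr %p4
    ret void
  }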