diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2308,6 +2308,55 @@
   return false;
 }
 
+/// Return true if some load/store using \p Addr as its address could be
+/// encoded as a compressed instruction when \p Offset is folded into it. The
+/// memory type is inferred from the LOAD/STORE users of \p Addr.
+///
+/// The compressed register-relative loads/stores encode a zero-extended 5-bit
+/// immediate scaled by the access size, so only non-negative offsets that are
+/// a multiple of 4 (up to 124) or of 8 (up to 248) can qualify; hence the
+/// unsigned isShiftedUInt checks. This is only a heuristic: it does not
+/// consider the register restrictions of the compressed encodings.
+static bool benefitFromCompressedLoadStore(SDValue Addr, uint64_t Offset,
+                                           const RISCVSubtarget *Subtarget) {
+  // Bail out early if the offset fits neither the 4-byte nor the 8-byte form.
+  if (!isShiftedUInt<5, 2>(Offset) && !isShiftedUInt<5, 3>(Offset))
+    return false;
+  // No compressed loads/stores are available without one of these extensions.
+  if (!(Subtarget->hasStdExtC() || Subtarget->hasStdExtZca() ||
+        Subtarget->hasStdExtZcf() || Subtarget->hasStdExtZcd()))
+    return false;
+
+  for (auto *Use : Addr->uses()) {
+    if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE)
+      continue;
+    // Don't allow stores of the value. It must be used as the address.
+    if (Use->getOpcode() == ISD::STORE &&
+        cast<StoreSDNode>(Use)->getValue() == Addr)
+      continue;
+
+    EVT VT = cast<MemSDNode>(Use)->getMemoryVT();
+
+    // A single compressible user is enough to report a benefit.
+    if (VT == MVT::i32 && Subtarget->hasStdExtCOrZca() &&
+        isShiftedUInt<5, 2>(Offset))
+      return true;
+    if (VT == MVT::i64 && Subtarget->hasStdExtCOrZca() &&
+        isShiftedUInt<5, 3>(Offset))
+      return true;
+    if (VT == MVT::f32 &&
+        (Subtarget->hasStdExtC() || Subtarget->hasStdExtZcf()) &&
+        isShiftedUInt<5, 2>(Offset))
+      return true;
+    if (VT == MVT::f64 &&
+        (Subtarget->hasStdExtC() || Subtarget->hasStdExtZcd()) &&
+        isShiftedUInt<5, 3>(Offset))
+      return true;
+  }
+
+  return false;
+}
+
 bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) {
   if (SelectAddrFrameIndex(Addr, Base, Offset))
@@ -2322,10 +2371,61 @@
     return true;
   }
 
-  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+  // Decompose the addimm chain into (Base, Offset) pairs. Each pair holds a
+  // base together with the offset accumulated from the outermost ADD down to
+  // it, so later entries correspond to deeper bases.
+  SmallVector<std::pair<SDValue, int64_t>, 4> AddrDecomposition;
+  {
+    SDValue CurAddr = Addr;
+    int64_t CurOffset = 0;
+    while (CurAddr.getOpcode() == ISD::ADD &&
+           isa<ConstantSDNode>(CurAddr.getOperand(1))) {
+      SDValue CurBase = CurAddr.getOperand(0);
+      CurOffset += cast<ConstantSDNode>(CurAddr.getOperand(1))->getSExtValue();
+      AddrDecomposition.push_back(std::make_pair(CurBase, CurOffset));
+      CurAddr = CurBase;
+    }
+    if (AddrDecomposition.empty() && CurDAG->isBaseWithConstantOffset(Addr)) {
+      AddrDecomposition.push_back(std::make_pair(
+          Addr.getOperand(0),
+          cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue()));
+    }
+  }
+
+  if (!AddrDecomposition.empty()) {
+    bool IsWorthFoldingAdd = isWorthFoldingAdd(Addr);
+
+    uint32_t BestCost = std::numeric_limits<uint32_t>::max();
+    Base = Addr.getOperand(0);
     int64_t CVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
+    // Pick the cheapest pair. A pair costs 2 when a user of Addr could use a
+    // compressed load/store with the resulting offset and 4 otherwise; pairs
+    // whose offset cannot be folded keep the maximum cost.
+    for (auto [CurBase, CurOffset] : AddrDecomposition) {
+      uint32_t CurCost = std::numeric_limits<uint32_t>::max();
+      if (isInt<12>(CurOffset)) {
+        CurCost =
+            benefitFromCompressedLoadStore(Addr, CurOffset, Subtarget) ? 2 : 4;
+      } else if (IsWorthFoldingAdd) {
+        // Same split as the large-immediate handling further down.
+        int64_t Adj = CurOffset < 0 ? -2048 : 2047;
+        int64_t OffsetVal = CurOffset - Adj;
+        if (isInt<12>(OffsetVal)) {
+          CurCost = benefitFromCompressedLoadStore(Addr, OffsetVal, Subtarget)
+                        ? 2
+                        : 4;
+        }
+      }
+
+      // Prefer deeper pair when they cost the same.
+      if (CurCost <= BestCost) {
+        BestCost = CurCost;
+        Base = CurBase;
+        CVal = CurOffset;
+      }
+    }
+
     if (isInt<12>(CVal)) {
-      Base = Addr.getOperand(0);
       if (Base.getOpcode() == RISCVISD::ADD_LO) {
         SDValue LoOperand = Base.getOperand(1);
         if (auto *GA = dyn_cast<GlobalAddressSDNode>(LoOperand)) {
@@ -2353,23 +2453,21 @@
       Offset = CurDAG->getTargetConstant(CVal, DL, VT);
       return true;
     }
-  }
 
-  // Handle ADD with large immediates.
-  if (Addr.getOpcode() == ISD::ADD && isa<ConstantSDNode>(Addr.getOperand(1))) {
-    int64_t CVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
+    // Handle ADD chain with large immediates.
     assert(!isInt<12>(CVal) && "simm12 not already handled?");
 
-    // Handle immediates in the range [-4096,-2049] or [2048, 4094]. We can use
+    // Handle immediates in the range [-4096, -2049] or [2048, 4094]. We can use
     // an ADDI for part of the offset and fold the rest into the load/store.
     // This mirrors the AddiPair PatFrag in RISCVInstrInfo.td.
-    if (isInt<12>(CVal / 2) && isInt<12>(CVal - CVal / 2)) {
-      int64_t Adj = CVal < 0 ? -2048 : 2047;
+    int64_t Adj = CVal < 0 ? -2048 : 2047;
+    int64_t OffsetVal = CVal - Adj;
+    if (IsWorthFoldingAdd && isInt<12>(OffsetVal)) {
       Base = SDValue(
-          CurDAG->getMachineNode(RISCV::ADDI, DL, VT, Addr.getOperand(0),
+          CurDAG->getMachineNode(RISCV::ADDI, DL, VT, Base,
                                  CurDAG->getTargetConstant(Adj, DL, VT)),
           0);
-      Offset = CurDAG->getTargetConstant(CVal - Adj, DL, VT);
+      Offset = CurDAG->getTargetConstant(OffsetVal, DL, VT);
       return true;
     }
 
@@ -2379,13 +2477,13 @@
     // stores that can fold the lo12 bits. Otherwise, the ADD will get iseled
     // separately with the full materialized immediate creating extra
     // instructions.
-    if (isWorthFoldingAdd(Addr) &&
-        selectConstantAddr(CurDAG, DL, VT, Subtarget, Addr.getOperand(1), Base,
-                           Offset)) {
+    SDValue AddrBase = Base;
+    SDValue AddrOffset = CurDAG->getTargetConstant(CVal, DL, VT);
+    if (IsWorthFoldingAdd && selectConstantAddr(CurDAG, DL, VT, Subtarget,
+                                                AddrOffset, Base, Offset)) {
       // Insert an ADD instruction with the materialized Hi52 bits.
-      Base = SDValue(
-          CurDAG->getMachineNode(RISCV::ADD, DL, VT, Addr.getOperand(0), Base),
-          0);
+      Base = SDValue(CurDAG->getMachineNode(RISCV::ADD, DL, VT, AddrBase, Base),
+                     0);
       return true;
     }
   }
diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll
--- a/llvm/test/CodeGen/RISCV/split-offsets.ll
+++ b/llvm/test/CodeGen/RISCV/split-offsets.ll
@@ -1,43 +1,47 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefix=RV32I
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV32,RV32I
 ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s -check-prefix=RV64I
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV64,RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+c -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV32,RV32C
+; RUN: llc -mtriple=riscv64 -mattr=+c -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=CHECK,RV64,RV64C
 
 ; Check that memory accesses to array elements with large offsets have those
 ; offsets split into a base offset, plus a smaller offset that is folded into
 ; the memory operation. We should also only compute that base offset once,
 ; since it can be shared for all memory operations in this test.
define void @test1(ptr %sp, ptr %t, i32 %n) { -; RV32I-LABEL: test1: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lw a0, 0(a0) -; RV32I-NEXT: lui a2, 20 -; RV32I-NEXT: addi a2, a2, -1920 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: add a0, a0, a2 -; RV32I-NEXT: li a2, 2 -; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: li a3, 1 -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a3, 0(a1) -; RV32I-NEXT: sw a2, 4(a1) -; RV32I-NEXT: ret +; RV32-LABEL: test1: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lui a2, 20 +; RV32-NEXT: addi a2, a2, -1920 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: li a2, 2 +; RV32-NEXT: sw a2, 0(a0) +; RV32-NEXT: li a3, 1 +; RV32-NEXT: sw a3, 4(a0) +; RV32-NEXT: sw a3, 0(a1) +; RV32-NEXT: sw a2, 4(a1) +; RV32-NEXT: ret ; -; RV64I-LABEL: test1: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: ld a0, 0(a0) -; RV64I-NEXT: lui a2, 20 -; RV64I-NEXT: addiw a2, a2, -1920 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: add a0, a0, a2 -; RV64I-NEXT: li a2, 2 -; RV64I-NEXT: sw a2, 0(a0) -; RV64I-NEXT: li a3, 1 -; RV64I-NEXT: sw a3, 4(a0) -; RV64I-NEXT: sw a3, 0(a1) -; RV64I-NEXT: sw a2, 4(a1) -; RV64I-NEXT: ret +; RV64-LABEL: test1: +; RV64: # %bb.0: # %entry +; RV64-NEXT: ld a0, 0(a0) +; RV64-NEXT: lui a2, 20 +; RV64-NEXT: addiw a2, a2, -1920 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: li a2, 2 +; RV64-NEXT: sw a2, 0(a0) +; RV64-NEXT: li a3, 1 +; RV64-NEXT: sw a3, 4(a0) +; RV64-NEXT: sw a3, 0(a1) +; RV64-NEXT: sw a2, 4(a1) +; RV64-NEXT: ret entry: %s = load ptr, ptr %sp %gep0 = getelementptr [65536 x i32], ptr %s, i64 0, i32 20000 @@ -53,48 +57,48 @@ ; Ditto. Check it when the GEPs are not in the entry block. 
define void @test2(ptr %sp, ptr %t, i32 %n) { -; RV32I-LABEL: test2: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: li a3, 0 -; RV32I-NEXT: lw a0, 0(a0) -; RV32I-NEXT: lui a4, 20 -; RV32I-NEXT: addi a4, a4, -1920 -; RV32I-NEXT: add a1, a1, a4 -; RV32I-NEXT: add a0, a0, a4 -; RV32I-NEXT: blez a2, .LBB1_2 -; RV32I-NEXT: .LBB1_1: # %while_body -; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: addi a4, a3, 1 -; RV32I-NEXT: sw a4, 0(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a4, 0(a1) -; RV32I-NEXT: sw a3, 4(a1) -; RV32I-NEXT: mv a3, a4 -; RV32I-NEXT: blt a4, a2, .LBB1_1 -; RV32I-NEXT: .LBB1_2: # %while_end -; RV32I-NEXT: ret +; RV32-LABEL: test2: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a3, 0 +; RV32-NEXT: lw a0, 0(a0) +; RV32-NEXT: lui a4, 20 +; RV32-NEXT: addi a4, a4, -1920 +; RV32-NEXT: add a1, a1, a4 +; RV32-NEXT: add a0, a0, a4 +; RV32-NEXT: blez a2, .LBB1_2 +; RV32-NEXT: .LBB1_1: # %while_body +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: addi a4, a3, 1 +; RV32-NEXT: sw a4, 0(a0) +; RV32-NEXT: sw a3, 4(a0) +; RV32-NEXT: sw a4, 0(a1) +; RV32-NEXT: sw a3, 4(a1) +; RV32-NEXT: mv a3, a4 +; RV32-NEXT: blt a4, a2, .LBB1_1 +; RV32-NEXT: .LBB1_2: # %while_end +; RV32-NEXT: ret ; -; RV64I-LABEL: test2: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: li a3, 0 -; RV64I-NEXT: ld a0, 0(a0) -; RV64I-NEXT: lui a4, 20 -; RV64I-NEXT: addiw a4, a4, -1920 -; RV64I-NEXT: add a1, a1, a4 -; RV64I-NEXT: add a0, a0, a4 -; RV64I-NEXT: sext.w a2, a2 -; RV64I-NEXT: blez a2, .LBB1_2 -; RV64I-NEXT: .LBB1_1: # %while_body -; RV64I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV64I-NEXT: addiw a4, a3, 1 -; RV64I-NEXT: sw a4, 0(a0) -; RV64I-NEXT: sw a3, 4(a0) -; RV64I-NEXT: sw a4, 0(a1) -; RV64I-NEXT: sw a3, 4(a1) -; RV64I-NEXT: mv a3, a4 -; RV64I-NEXT: blt a4, a2, .LBB1_1 -; RV64I-NEXT: .LBB1_2: # %while_end -; RV64I-NEXT: ret +; RV64-LABEL: test2: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a3, 0 +; RV64-NEXT: ld a0, 0(a0) +; RV64-NEXT: lui a4, 20 +; 
RV64-NEXT: addiw a4, a4, -1920 +; RV64-NEXT: add a1, a1, a4 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: sext.w a2, a2 +; RV64-NEXT: blez a2, .LBB1_2 +; RV64-NEXT: .LBB1_1: # %while_body +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: addiw a4, a3, 1 +; RV64-NEXT: sw a4, 0(a0) +; RV64-NEXT: sw a3, 4(a0) +; RV64-NEXT: sw a4, 0(a1) +; RV64-NEXT: sw a3, 4(a1) +; RV64-NEXT: mv a3, a4 +; RV64-NEXT: blt a4, a2, .LBB1_1 +; RV64-NEXT: .LBB1_2: # %while_end +; RV64-NEXT: ret entry: %s = load ptr, ptr %sp br label %while_cond @@ -122,27 +126,27 @@ ; instructions. Make sure we use an offset and common base for each of the ; stores. define void @test3(ptr %t) { -; RV32I-LABEL: test3: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: lui a1, 20 -; RV32I-NEXT: addi a1, a1, -1920 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: li a1, 2 -; RV32I-NEXT: sw a1, 4(a0) -; RV32I-NEXT: li a1, 3 -; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: ret +; RV32-LABEL: test3: +; RV32: # %bb.0: # %entry +; RV32-NEXT: lui a1, 20 +; RV32-NEXT: addi a1, a1, -1920 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: li a1, 2 +; RV32-NEXT: sw a1, 4(a0) +; RV32-NEXT: li a1, 3 +; RV32-NEXT: sw a1, 8(a0) +; RV32-NEXT: ret ; -; RV64I-LABEL: test3: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: lui a1, 20 -; RV64I-NEXT: addiw a1, a1, -1920 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: li a1, 2 -; RV64I-NEXT: sw a1, 4(a0) -; RV64I-NEXT: li a1, 3 -; RV64I-NEXT: sw a1, 8(a0) -; RV64I-NEXT: ret +; RV64-LABEL: test3: +; RV64: # %bb.0: # %entry +; RV64-NEXT: lui a1, 20 +; RV64-NEXT: addiw a1, a1, -1920 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 2 +; RV64-NEXT: sw a1, 4(a0) +; RV64-NEXT: li a1, 3 +; RV64-NEXT: sw a1, 8(a0) +; RV64-NEXT: ret entry: %splitgep = getelementptr i8, ptr %t, i64 80000 %0 = getelementptr i8, ptr %splitgep, i64 4 @@ -154,27 +158,15 @@ ; Test from PR62734. 
define void @test4(ptr %dest) { -; RV32I-LABEL: test4: -; RV32I: # %bb.0: -; RV32I-NEXT: addi a0, a0, 2047 -; RV32I-NEXT: addi a1, a0, 1 -; RV32I-NEXT: li a2, 1 -; RV32I-NEXT: sb a2, 1(a0) -; RV32I-NEXT: sb a2, 1(a1) -; RV32I-NEXT: sb a2, 2(a1) -; RV32I-NEXT: sb a2, 3(a1) -; RV32I-NEXT: ret -; -; RV64I-LABEL: test4: -; RV64I: # %bb.0: -; RV64I-NEXT: addi a0, a0, 2047 -; RV64I-NEXT: addi a1, a0, 1 -; RV64I-NEXT: li a2, 1 -; RV64I-NEXT: sb a2, 1(a0) -; RV64I-NEXT: sb a2, 1(a1) -; RV64I-NEXT: sb a2, 2(a1) -; RV64I-NEXT: sb a2, 3(a1) -; RV64I-NEXT: ret +; CHECK-LABEL: test4: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, 2047 +; CHECK-NEXT: li a1, 1 +; CHECK-NEXT: sb a1, 1(a0) +; CHECK-NEXT: sb a1, 2(a0) +; CHECK-NEXT: sb a1, 3(a0) +; CHECK-NEXT: sb a1, 4(a0) +; CHECK-NEXT: ret %p1 = getelementptr i8, ptr %dest, i32 2048 store i8 1, ptr %p1 %p2 = getelementptr i8, ptr %dest, i32 2049 @@ -185,3 +177,147 @@ store i8 1, ptr %p4 ret void } + +; Don't fold addimm chains when we can benefit from compressed load/store instructions. 
+define void @test5(ptr %dest) { +; RV32I-LABEL: test5: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, a0, 2047 +; RV32I-NEXT: li a1, 1 +; RV32I-NEXT: sw a1, 1953(a0) +; RV32I-NEXT: sw a1, 1957(a0) +; RV32I-NEXT: sw a1, 1961(a0) +; RV32I-NEXT: sw a1, 1965(a0) +; RV32I-NEXT: ret +; +; RV64I-LABEL: test5: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, a0, 2047 +; RV64I-NEXT: li a1, 1 +; RV64I-NEXT: sw a1, 1953(a0) +; RV64I-NEXT: sw a1, 1957(a0) +; RV64I-NEXT: sw a1, 1961(a0) +; RV64I-NEXT: sw a1, 1965(a0) +; RV64I-NEXT: ret +; +; RV32C-LABEL: test5: +; RV32C: # %bb.0: +; RV32C-NEXT: addi a0, a0, 2047 +; RV32C-NEXT: addi a0, a0, 1953 +; RV32C-NEXT: li a1, 1 +; RV32C-NEXT: sw a1, 0(a0) +; RV32C-NEXT: sw a1, 4(a0) +; RV32C-NEXT: sw a1, 8(a0) +; RV32C-NEXT: sw a1, 12(a0) +; RV32C-NEXT: ret +; +; RV64C-LABEL: test5: +; RV64C: # %bb.0: +; RV64C-NEXT: addi a0, a0, 2047 +; RV64C-NEXT: addi a0, a0, 1953 +; RV64C-NEXT: li a1, 1 +; RV64C-NEXT: sw a1, 0(a0) +; RV64C-NEXT: sw a1, 4(a0) +; RV64C-NEXT: sw a1, 8(a0) +; RV64C-NEXT: sw a1, 12(a0) +; RV64C-NEXT: ret + %p1 = getelementptr i32, ptr %dest, i32 1000 + store i32 1, ptr %p1 + %p2 = getelementptr i32, ptr %dest, i32 1001 + store i32 1, ptr %p2 + %p3 = getelementptr i32, ptr %dest, i32 1002 + store i32 1, ptr %p3 + %p4 = getelementptr i32, ptr %dest, i32 1003 + store i32 1, ptr %p4 + ret void +} + +; FIXME: We can reuse a1 to emit compressed load/store instructions. 
+define void @test6(ptr %dest) { +; RV32I-LABEL: test6: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a1, a0, 2047 +; RV32I-NEXT: li a2, 1 +; RV32I-NEXT: sw a2, 2040(a0) +; RV32I-NEXT: sw a2, 2044(a0) +; RV32I-NEXT: sw a2, 1(a1) +; RV32I-NEXT: sw a2, 5(a1) +; RV32I-NEXT: ret +; +; RV64I-LABEL: test6: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a1, a0, 2047 +; RV64I-NEXT: li a2, 1 +; RV64I-NEXT: sw a2, 2040(a0) +; RV64I-NEXT: sw a2, 2044(a0) +; RV64I-NEXT: sw a2, 1(a1) +; RV64I-NEXT: sw a2, 5(a1) +; RV64I-NEXT: ret +; +; RV32C-LABEL: test6: +; RV32C: # %bb.0: +; RV32C-NEXT: addi a1, a0, 2047 +; RV32C-NEXT: addi a1, a1, 1 +; RV32C-NEXT: li a2, 1 +; RV32C-NEXT: sw a2, 2040(a0) +; RV32C-NEXT: sw a2, 2044(a0) +; RV32C-NEXT: sw a2, 0(a1) +; RV32C-NEXT: sw a2, 4(a1) +; RV32C-NEXT: ret +; +; RV64C-LABEL: test6: +; RV64C: # %bb.0: +; RV64C-NEXT: addi a1, a0, 2047 +; RV64C-NEXT: addi a1, a1, 1 +; RV64C-NEXT: li a2, 1 +; RV64C-NEXT: sw a2, 2040(a0) +; RV64C-NEXT: sw a2, 2044(a0) +; RV64C-NEXT: sw a2, 0(a1) +; RV64C-NEXT: sw a2, 4(a1) +; RV64C-NEXT: ret + %p1 = getelementptr i32, ptr %dest, i32 510 + store i32 1, ptr %p1 + %p2 = getelementptr i32, ptr %dest, i32 511 + store i32 1, ptr %p2 + %p3 = getelementptr i32, ptr %dest, i32 512 + store i32 1, ptr %p3 + %p4 = getelementptr i32, ptr %dest, i32 513 + store i32 1, ptr %p4 + ret void +} + +; Don't fold addimm chains when the offset is large. 
+define void @test7(ptr %dest) { +; RV32-LABEL: test7: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, 10 +; RV32-NEXT: addi a1, a1, -960 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: li a1, 1 +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: sw a1, 4(a0) +; RV32-NEXT: sw a1, 8(a0) +; RV32-NEXT: sw a1, 12(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: test7: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 10 +; RV64-NEXT: addiw a1, a1, -960 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 1 +; RV64-NEXT: sw a1, 0(a0) +; RV64-NEXT: sw a1, 4(a0) +; RV64-NEXT: sw a1, 8(a0) +; RV64-NEXT: sw a1, 12(a0) +; RV64-NEXT: ret + %p1 = getelementptr i32, ptr %dest, i32 10000 + store i32 1, ptr %p1 + %p2 = getelementptr i32, ptr %dest, i32 10001 + store i32 1, ptr %p2 + %p3 = getelementptr i32, ptr %dest, i32 10002 + store i32 1, ptr %p3 + %p4 = getelementptr i32, ptr %dest, i32 10003 + store i32 1, ptr %p4 + ret void +}