diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4526,6 +4526,14 @@ Group<m_arm_Features_Group>, HelpText<"Do not add a BTI instruction after a setjmp or other" " return-twice construct (Arm/AArch64 only)">; +def aarch64_ldp_policy_EQ : Joined<["-"], "aarch64-ldp-policy=">, + Group<m_aarch64_Features_Group>, + Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>, + HelpText<"Fine-grained load pair policy (AArch64 only)">; +def aarch64_stp_policy_EQ : Joined<["-"], "aarch64-stp-policy=">, + Group<m_aarch64_Features_Group>, + Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>, + HelpText<"Fine-grained store pair policy (AArch64 only)">; foreach i = {1-31} in def ffixed_x#i : Flag<["-"], "ffixed-x"#i>, Group<m_Group>, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1846,6 +1846,32 @@ } AddUnalignedAccessWarning(CmdArgs); + + // Handle -aarch64-ldp-policy= + if (Arg *A = Args.getLastArg(options::OPT_aarch64_ldp_policy_EQ)) { + StringRef Val = A->getValue(); + const Driver &D = getToolChain().getDriver(); + if (!Val.equals("aligned") && !Val.equals("never") && + !Val.equals("always") && !Val.equals("default")) + // Handle the unsupported values passed to aarch64-ldp-policy. + D.Diag(diag::err_drv_unsupported_option_argument) + << A->getSpelling() << Val; + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back(Args.MakeArgString("-aarch64-ldp-policy=" + Val)); + } + + // Handle -aarch64-stp-policy= + if (Arg *A = Args.getLastArg(options::OPT_aarch64_stp_policy_EQ)) { + StringRef Val = A->getValue(); + const Driver &D = getToolChain().getDriver(); + if (!Val.equals("aligned") && !Val.equals("never") && + !Val.equals("always") && !Val.equals("default")) + // Handle the unsupported values passed to aarch64-stp-policy. 
+ D.Diag(diag::err_drv_unsupported_option_argument) + << A->getSpelling() << Val; + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back(Args.MakeArgString("-aarch64-stp-policy=" + Val)); + } } void Clang::AddLoongArchTargetArgs(const ArgList &Args, diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -867,6 +867,30 @@ addMachineOutlinerArgs(D, Args, CmdArgs, ToolChain.getEffectiveTriple(), /*IsLTO=*/true, PluginOptPrefix); + + // Handle -aarch64-ldp-policy= + if (Arg *A = Args.getLastArg(options::OPT_aarch64_ldp_policy_EQ)) { + StringRef Val = A->getValue(); + if (!Val.equals("aligned") && !Val.equals("never") && + !Val.equals("always") && !Val.equals("default")) + // Handle the unsupported values passed to aarch64-ldp-policy. + D.Diag(diag::err_drv_unsupported_option_argument) + << A->getSpelling() << Val; + CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + + "-aarch64-ldp-policy=" + Val)); + } + + // Handle -aarch64-stp-policy= + if (Arg *A = Args.getLastArg(options::OPT_aarch64_stp_policy_EQ)) { + StringRef Val = A->getValue(); + if (!Val.equals("aligned") && !Val.equals("never") && + !Val.equals("always") && !Val.equals("default")) + // Handle the unsupported values passed to aarch64-stp-policy. 
+ D.Diag(diag::err_drv_unsupported_option_argument) + << A->getSpelling() << Val; + CmdArgs.push_back(Args.MakeArgString(Twine(PluginOptPrefix) + + "-aarch64-stp-policy=" + Val)); + } } void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC, diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -195,6 +195,32 @@ break; } + // Handle -aarch64-ldp-policy= + if (Arg *A = Args.getLastArg(options::OPT_aarch64_ldp_policy_EQ)) { + StringRef Val = A->getValue(); + const Driver &D = getToolChain().getDriver(); + if (!Val.equals("aligned") && !Val.equals("never") && + !Val.equals("always") && !Val.equals("default")) + // Handle the unsupported values passed to aarch64-ldp-policy. + D.Diag(diag::err_drv_unsupported_option_argument) + << A->getSpelling() << Val; + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back(Args.MakeArgString("-aarch64-ldp-policy=" + Val)); + } + + // Handle -aarch64-stp-policy= + if (Arg *A = Args.getLastArg(options::OPT_aarch64_stp_policy_EQ)) { + StringRef Val = A->getValue(); + const Driver &D = getToolChain().getDriver(); + if (!Val.equals("aligned") && !Val.equals("never") && + !Val.equals("always") && !Val.equals("default")) + // Handle the unsupported values passed to aarch64-stp-policy. + D.Diag(diag::err_drv_unsupported_option_argument) + << A->getSpelling() << Val; + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back(Args.MakeArgString("-aarch64-stp-policy=" + Val)); + } + // TODO: Add target specific flags, ABI, mtune option etc. 
} diff --git a/clang/test/Driver/aarch64-ldp-policy.c b/clang/test/Driver/aarch64-ldp-policy.c new file mode 100644 --- /dev/null +++ b/clang/test/Driver/aarch64-ldp-policy.c @@ -0,0 +1,13 @@ +// RUN: %clang -### -target aarch64 -aarch64-ldp-policy=always %s -c 2>&1 | FileCheck -check-prefix=CHECK-ALWAYS %s +// RUN: %clang -### -target aarch64 -aarch64-ldp-policy=aligned %s -c 2>&1 | FileCheck -check-prefix=CHECK-ALIGNED %s +// RUN: %clang -### -target aarch64 -aarch64-ldp-policy=never %s -c 2>&1 | FileCheck -check-prefix=CHECK-NEVER %s +// RUN: %clang -### -target aarch64 -aarch64-ldp-policy=default %s -c 2>&1 | FileCheck -check-prefix=CHECK-DEFAULT %s +// RUN: not %clang -### -target aarch64 -aarch64-ldp-policy=def %s -c 2>&1 | FileCheck -check-prefix=CHECK-ARGUMENT %s +// RUN: not %clang -c -target x86-64 -aarch64-ldp-policy=aligned %s 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s + +// CHECK-ALWAYS: "-aarch64-ldp-policy=always" +// CHECK-ALIGNED: "-aarch64-ldp-policy=aligned" +// CHECK-NEVER: "-aarch64-ldp-policy=never" +// CHECK-DEFAULT: "-aarch64-ldp-policy=default" +// CHECK-ARGUMENT: clang: error: unsupported argument 'def' to option '-aarch64-ldp-policy=' +// CHECK-TRIPLE: clang: error: unsupported option '-aarch64-ldp-policy=' for target diff --git a/clang/test/Driver/aarch64-stp-policy.c b/clang/test/Driver/aarch64-stp-policy.c new file mode 100644 --- /dev/null +++ b/clang/test/Driver/aarch64-stp-policy.c @@ -0,0 +1,13 @@ +// RUN: %clang -### -target aarch64 -aarch64-stp-policy=always %s -c 2>&1 | FileCheck -check-prefix=CHECK-ALWAYS %s +// RUN: %clang -### -target aarch64 -aarch64-stp-policy=aligned %s -c 2>&1 | FileCheck -check-prefix=CHECK-ALIGNED %s +// RUN: %clang -### -target aarch64 -aarch64-stp-policy=never %s -c 2>&1 | FileCheck -check-prefix=CHECK-NEVER %s +// RUN: %clang -### -target aarch64 -aarch64-stp-policy=default %s -c 2>&1 | FileCheck -check-prefix=CHECK-DEFAULT %s +// RUN: not %clang -### -target aarch64 -aarch64-stp-policy=def %s -c 
2>&1 | FileCheck -check-prefix=CHECK-ARGUMENT %s +// RUN: not %clang -c -target x86-64 -aarch64-stp-policy=aligned %s 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s + +// CHECK-ALWAYS: "-aarch64-stp-policy=always" +// CHECK-ALIGNED: "-aarch64-stp-policy=aligned" +// CHECK-NEVER: "-aarch64-stp-policy=never" +// CHECK-DEFAULT: "-aarch64-stp-policy=default" +// CHECK-ARGUMENT: clang: error: unsupported argument 'def' to option '-aarch64-stp-policy=' +// CHECK-TRIPLE: clang: error: unsupported option '-aarch64-stp-policy=' for target diff --git a/clang/test/Driver/flang/aarch64-ldp-policy.f90 b/clang/test/Driver/flang/aarch64-ldp-policy.f90 new file mode 100644 --- /dev/null +++ b/clang/test/Driver/flang/aarch64-ldp-policy.f90 @@ -0,0 +1,11 @@ +! RUN: %clang -### --driver-mode=flang -target aarch64 -aarch64-ldp-policy=always %s -c 2>&1 | FileCheck -check-prefix=CHECK-ALWAYS %s +! RUN: %clang -### --driver-mode=flang -target aarch64 -aarch64-ldp-policy=aligned %s -c 2>&1 | FileCheck -check-prefix=CHECK-ALIGNED %s +! RUN: %clang -### --driver-mode=flang -target aarch64 -aarch64-ldp-policy=never %s -c 2>&1 | FileCheck -check-prefix=CHECK-NEVER %s +! RUN: %clang -### --driver-mode=flang -target aarch64 -aarch64-ldp-policy=default %s -c 2>&1 | FileCheck -check-prefix=CHECK-DEFAULT %s +! RUN: not %clang -### --driver-mode=flang -target aarch64 -aarch64-ldp-policy=def %s -c 2>&1 | FileCheck -check-prefix=CHECK-ARGUMENT %s + +! CHECK-ALWAYS: "-aarch64-ldp-policy=always" +! CHECK-ALIGNED: "-aarch64-ldp-policy=aligned" +! CHECK-NEVER: "-aarch64-ldp-policy=never" +! CHECK-DEFAULT: "-aarch64-ldp-policy=default" +! CHECK-ARGUMENT: clang: error: unsupported argument 'def' to option '-aarch64-ldp-policy=' diff --git a/clang/test/Driver/flang/aarch64-stp-policy.f90 b/clang/test/Driver/flang/aarch64-stp-policy.f90 new file mode 100644 --- /dev/null +++ b/clang/test/Driver/flang/aarch64-stp-policy.f90 @@ -0,0 +1,11 @@ +! 
RUN: %clang -### --driver-mode=flang -target aarch64 -aarch64-stp-policy=always %s -c 2>&1 | FileCheck -check-prefix=CHECK-ALWAYS %s +! RUN: %clang -### --driver-mode=flang -target aarch64 -aarch64-stp-policy=aligned %s -c 2>&1 | FileCheck -check-prefix=CHECK-ALIGNED %s +! RUN: %clang -### --driver-mode=flang -target aarch64 -aarch64-stp-policy=never %s -c 2>&1 | FileCheck -check-prefix=CHECK-NEVER %s +! RUN: %clang -### --driver-mode=flang -target aarch64 -aarch64-stp-policy=default %s -c 2>&1 | FileCheck -check-prefix=CHECK-DEFAULT %s +! RUN: not %clang -### --driver-mode=flang -target aarch64 -aarch64-stp-policy=def %s -c 2>&1 | FileCheck -check-prefix=CHECK-ARGUMENT %s + +! CHECK-ALWAYS: "-aarch64-stp-policy=always" +! CHECK-ALIGNED: "-aarch64-stp-policy=aligned" +! CHECK-NEVER: "-aarch64-stp-policy=never" +! CHECK-DEFAULT: "-aarch64-stp-policy=default" +! CHECK-ARGUMENT: clang: error: unsupported argument 'def' to option '-aarch64-stp-policy=' diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -67,6 +67,42 @@ DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming", "Controls which pairs are considered for renaming"); +enum LdpPolicy { + LDP_POLICY_ALWAYS, ///< Emit ldp regardless of alignment. + LDP_POLICY_NEVER, ///< Do not emit ldp. + LDP_POLICY_ALIGNED ///< In order to emit ldp, first check if the load will + ///< be aligned to 2 * element_size. +}; + +enum StpPolicy { + STP_POLICY_ALWAYS, ///< Emit stp regardless of alignment. + STP_POLICY_NEVER, ///< Do not emit stp. + STP_POLICY_ALIGNED ///< In order to emit stp, first check if the store will + ///< be aligned to 2 * element_size. 
+}; + +static cl::opt<LdpPolicy> AArch64LdpPolicy( + "aarch64-ldp-policy", cl::Optional, cl::init(LDP_POLICY_ALWAYS), + cl::desc("AArch64 Specific: Load pair policy."), + cl::values(clEnumValN(LDP_POLICY_NEVER, "never", "Do not emit ldp."), + clEnumValN(LDP_POLICY_ALIGNED, "aligned", + "Emit ldp only if the source pointer is aligned to " + "at least double the alignment of the type."), + clEnumValN(LDP_POLICY_ALWAYS, "always", + "Emit ldp regardless of alignment. (default)"), + clEnumValN(LDP_POLICY_ALWAYS, "default", "Use the default."))); + +static cl::opt<StpPolicy> AArch64StpPolicy( + "aarch64-stp-policy", cl::Optional, cl::init(STP_POLICY_ALWAYS), + cl::desc("AArch64 Specific: Store pair policy."), + cl::values(clEnumValN(STP_POLICY_NEVER, "never", "Do not emit stp."), + clEnumValN(STP_POLICY_ALIGNED, "aligned", + "Emit stp only if the source pointer is aligned to " + "at least double the alignment of the type."), + clEnumValN(STP_POLICY_ALWAYS, "always", + "Emit stp regardless of alignment. (default)"), + clEnumValN(STP_POLICY_ALWAYS, "default", "Use the default."))); + // The LdStLimit limits how far we search for load/store pairs. static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); @@ -2136,6 +2172,16 @@ if (!TII->isCandidateToMergeOrPair(MI)) return false; + // Fetch the memoperand of the load/store that is a candidate for combination. + MachineMemOperand *memOp = MBBI->memoperands().data()[0]; + + // If a load arrives and LDP_POLICY_NEVER is opted, do not emit ldp. + if (memOp->isLoad() && AArch64LdpPolicy == LDP_POLICY_NEVER) + return false; + // If a store arrives and STP_POLICY_NEVER is opted, do not emit stp. + if (memOp->isStore() && AArch64StpPolicy == STP_POLICY_NEVER) + return false; + // Early exit if the offset is not possible to match. 
(6 bits of positive // range, plus allow an extra one in case we find a later insn that matches // with Offset-1) @@ -2159,6 +2205,24 @@ // Keeping the iterator straight is a pain, so we let the merge routine tell // us what the next instruction is after it's done mucking about. auto Prev = std::prev(MBBI); + + // Get the needed alignments to check them if + // LDP_POLICY_ALIGNED/STP_POLICY_ALIGNED is opted. + uint64_t memAlignment = memOp->getAlign().value(); + uint64_t typeAlignment = Align(memOp->getSize()).value(); + + // If a load arrives and LDP_POLICY_ALIGNED is opted, check that the + // alignment of the source pointer is at least double the alignment of the + // type. + if (memOp->isLoad() && AArch64LdpPolicy == LDP_POLICY_ALIGNED && + memAlignment < 2 * typeAlignment) + return false; + // If a store arrives and STP_POLICY_ALIGNED is opted, check that the + // alignment of the source pointer is at least double the alignment of the + // type. + if (memOp->isStore() && AArch64StpPolicy == STP_POLICY_ALIGNED && + memAlignment < 2 * typeAlignment) + return false; MBBI = mergePairedInsns(MBBI, Paired, Flags); // Collect liveness info for instructions between Prev and the new position // MBBI. 
diff --git a/llvm/test/CodeGen/AArch64/ldp-aligned.ll b/llvm/test/CodeGen/AArch64/ldp-aligned.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ldp-aligned.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=generic -aarch64-ldp-policy=aligned | FileCheck %s + +define i32 @ldp_aligned_int32_t(ptr %0) #0 { +; CHECK-LABEL: ldp_aligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffffc0 +; CHECK-NEXT: ldp w9, w8, [x8] +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -64 + %4 = inttoptr i64 %3 to ptr + %5 = load i32, ptr %4, align 64 + %6 = getelementptr inbounds i32, ptr %4, i64 1 + %7 = load i32, ptr %6, align 4 + %8 = add nsw i32 %7, %5 + ret i32 %8 +} + +define i64 @ldp_aligned_int64_t(ptr %0) #0 { +; CHECK-LABEL: ldp_aligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff80 +; CHECK-NEXT: ldp x9, x8, [x8] +; CHECK-NEXT: add x0, x8, x9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -128 + %4 = inttoptr i64 %3 to ptr + %5 = load i64, ptr %4, align 128 + %6 = getelementptr inbounds i64, ptr %4, i64 1 + %7 = load i64, ptr %6, align 8 + %8 = add nsw i64 %7, %5 + ret i64 %8 +} + +define <4 x i32> @ldp_aligned_v4si(ptr %0) #0 { +; CHECK-LABEL: ldp_aligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff00 +; CHECK-NEXT: ldp q0, q1, [x8] +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -256 + %4 = inttoptr i64 %3 to ptr + %5 = load <4 x i32>, ptr %4, align 256 + %6 = getelementptr inbounds <4 x i32>, ptr %4, i64 1 + %7 = load <4 x i32>, ptr %6, align 16 + %8 = add <4 x i32> %7, %5 + ret <4 x i32> %8 +} + +define i32 @ldp_unaligned_int32_t(ptr %0) #0 { +; CHECK-LABEL: ldp_unaligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, 
#0xffffffffffffffc0 +; CHECK-NEXT: ldr w9, [x8, #4] +; CHECK-NEXT: ldr w8, [x8, #8] +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -64 + %4 = inttoptr i64 %3 to ptr + %5 = getelementptr inbounds i32, ptr %4, i64 1 + %6 = load i32, ptr %5, align 4 + %7 = getelementptr inbounds i32, ptr %4, i64 2 + %8 = load i32, ptr %7, align 8 + %9 = add nsw i32 %8, %6 + ret i32 %9 +} + +define i64 @ldp_unaligned_int64_t(ptr %0) #0 { +; CHECK-LABEL: ldp_unaligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff80 +; CHECK-NEXT: ldr x9, [x8, #8] +; CHECK-NEXT: ldr x8, [x8, #16] +; CHECK-NEXT: add x0, x8, x9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -128 + %4 = inttoptr i64 %3 to ptr + %5 = getelementptr inbounds i64, ptr %4, i64 1 + %6 = load i64, ptr %5, align 8 + %7 = getelementptr inbounds i64, ptr %4, i64 2 + %8 = load i64, ptr %7, align 16 + %9 = add nsw i64 %8, %6 + ret i64 %9 +} + +define <4 x i32> @ldp_unaligned_v4si(ptr %0) #0 { +; CHECK-LABEL: ldp_unaligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff00 +; CHECK-NEXT: ldr q0, [x8, #16] +; CHECK-NEXT: ldr q1, [x8, #32] +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -256 + %4 = inttoptr i64 %3 to ptr + %5 = getelementptr inbounds <4 x i32>, ptr %4, i64 1 + %6 = load <4 x i32>, ptr %5, align 16 + %7 = getelementptr inbounds <4 x i32>, ptr %4, i64 2 + %8 = load <4 x i32>, ptr %7, align 32 + %9 = add <4 x i32> %8, %6 + ret <4 x i32> %9 +} diff --git a/llvm/test/CodeGen/AArch64/ldp-always.ll b/llvm/test/CodeGen/AArch64/ldp-always.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ldp-always.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=generic -aarch64-ldp-policy=always | FileCheck %s + +define i32 
@ldp_aligned_int32_t(ptr %0) #0 { +; CHECK-LABEL: ldp_aligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffffc0 +; CHECK-NEXT: ldp w9, w8, [x8] +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -64 + %4 = inttoptr i64 %3 to ptr + %5 = load i32, ptr %4, align 64 + %6 = getelementptr inbounds i32, ptr %4, i64 1 + %7 = load i32, ptr %6, align 4 + %8 = add nsw i32 %7, %5 + ret i32 %8 +} + +define i64 @ldp_aligned_int64_t(ptr %0) #0 { +; CHECK-LABEL: ldp_aligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff80 +; CHECK-NEXT: ldp x9, x8, [x8] +; CHECK-NEXT: add x0, x8, x9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -128 + %4 = inttoptr i64 %3 to ptr + %5 = load i64, ptr %4, align 128 + %6 = getelementptr inbounds i64, ptr %4, i64 1 + %7 = load i64, ptr %6, align 8 + %8 = add nsw i64 %7, %5 + ret i64 %8 +} + +define <4 x i32> @ldp_aligned_v4si(ptr %0) #0 { +; CHECK-LABEL: ldp_aligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff00 +; CHECK-NEXT: ldp q0, q1, [x8] +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -256 + %4 = inttoptr i64 %3 to ptr + %5 = load <4 x i32>, ptr %4, align 256 + %6 = getelementptr inbounds <4 x i32>, ptr %4, i64 1 + %7 = load <4 x i32>, ptr %6, align 16 + %8 = add <4 x i32> %7, %5 + ret <4 x i32> %8 +} + +define i32 @ldp_unaligned_int32_t(ptr %0) #0 { +; CHECK-LABEL: ldp_unaligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffffc0 +; CHECK-NEXT: ldp w9, w8, [x8, #4] +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -64 + %4 = inttoptr i64 %3 to ptr + %5 = getelementptr inbounds i32, ptr %4, i64 1 + %6 = load i32, ptr %5, align 4 + %7 = getelementptr inbounds i32, ptr %4, i64 2 + %8 = load i32, ptr %7, align 8 + %9 = add nsw i32 %8, %6 + ret i32 %9 +} + +define i64 
@ldp_unaligned_int64_t(ptr %0) #0 { +; CHECK-LABEL: ldp_unaligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff80 +; CHECK-NEXT: ldp x9, x8, [x8, #8] +; CHECK-NEXT: add x0, x8, x9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -128 + %4 = inttoptr i64 %3 to ptr + %5 = getelementptr inbounds i64, ptr %4, i64 1 + %6 = load i64, ptr %5, align 8 + %7 = getelementptr inbounds i64, ptr %4, i64 2 + %8 = load i64, ptr %7, align 16 + %9 = add nsw i64 %8, %6 + ret i64 %9 +} + +define <4 x i32> @ldp_unaligned_v4si(ptr %0) #0 { +; CHECK-LABEL: ldp_unaligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff00 +; CHECK-NEXT: ldp q0, q1, [x8, #16] +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -256 + %4 = inttoptr i64 %3 to ptr + %5 = getelementptr inbounds <4 x i32>, ptr %4, i64 1 + %6 = load <4 x i32>, ptr %5, align 16 + %7 = getelementptr inbounds <4 x i32>, ptr %4, i64 2 + %8 = load <4 x i32>, ptr %7, align 32 + %9 = add <4 x i32> %8, %6 + ret <4 x i32> %9 +} + diff --git a/llvm/test/CodeGen/AArch64/ldp-never.ll b/llvm/test/CodeGen/AArch64/ldp-never.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ldp-never.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=generic -aarch64-ldp-policy=never | FileCheck %s + +define i32 @ldp_aligned_int32_t(ptr %0) #0 { +; CHECK-LABEL: ldp_aligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffffc0 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: ldr w8, [x8, #4] +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -64 + %4 = inttoptr i64 %3 to ptr + %5 = load i32, ptr %4, align 64 + %6 = getelementptr inbounds i32, ptr %4, i64 1 + %7 = load i32, ptr %6, align 4 + %8 = add nsw i32 %7, %5 + ret i32 %8 +} + +define i64 
@ldp_aligned_int64_t(ptr %0) #0 { +; CHECK-LABEL: ldp_aligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff80 +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: ldr x8, [x8, #8] +; CHECK-NEXT: add x0, x8, x9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -128 + %4 = inttoptr i64 %3 to ptr + %5 = load i64, ptr %4, align 128 + %6 = getelementptr inbounds i64, ptr %4, i64 1 + %7 = load i64, ptr %6, align 8 + %8 = add nsw i64 %7, %5 + ret i64 %8 +} + +define <4 x i32> @ldp_aligned_v4si(ptr %0) #0 { +; CHECK-LABEL: ldp_aligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff00 +; CHECK-NEXT: ldr q0, [x8] +; CHECK-NEXT: ldr q1, [x8, #16] +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -256 + %4 = inttoptr i64 %3 to ptr + %5 = load <4 x i32>, ptr %4, align 256 + %6 = getelementptr inbounds <4 x i32>, ptr %4, i64 1 + %7 = load <4 x i32>, ptr %6, align 16 + %8 = add <4 x i32> %7, %5 + ret <4 x i32> %8 +} + +define i32 @ldp_unaligned_int32_t(ptr %0) #0 { +; CHECK-LABEL: ldp_unaligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffffc0 +; CHECK-NEXT: ldr w9, [x8, #4] +; CHECK-NEXT: ldr w8, [x8, #8] +; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -64 + %4 = inttoptr i64 %3 to ptr + %5 = getelementptr inbounds i32, ptr %4, i64 1 + %6 = load i32, ptr %5, align 4 + %7 = getelementptr inbounds i32, ptr %4, i64 2 + %8 = load i32, ptr %7, align 8 + %9 = add nsw i32 %8, %6 + ret i32 %9 +} + +define i64 @ldp_unaligned_int64_t(ptr %0) #0 { +; CHECK-LABEL: ldp_unaligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff80 +; CHECK-NEXT: ldr x9, [x8, #8] +; CHECK-NEXT: ldr x8, [x8, #16] +; CHECK-NEXT: add x0, x8, x9 +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -128 + %4 = inttoptr i64 %3 to ptr + %5 = getelementptr inbounds i64, ptr %4, i64 1 + %6 = 
load i64, ptr %5, align 8 + %7 = getelementptr inbounds i64, ptr %4, i64 2 + %8 = load i64, ptr %7, align 16 + %9 = add nsw i64 %8, %6 + ret i64 %9 +} + +define <4 x i32> @ldp_unaligned_v4si(ptr %0) #0 { +; CHECK-LABEL: ldp_unaligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff00 +; CHECK-NEXT: ldr q0, [x8, #16] +; CHECK-NEXT: ldr q1, [x8, #32] +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret + %2 = ptrtoint ptr %0 to i64 + %3 = and i64 %2, -256 + %4 = inttoptr i64 %3 to ptr + %5 = getelementptr inbounds <4 x i32>, ptr %4, i64 1 + %6 = load <4 x i32>, ptr %5, align 16 + %7 = getelementptr inbounds <4 x i32>, ptr %4, i64 2 + %8 = load <4 x i32>, ptr %7, align 32 + %9 = add <4 x i32> %8, %6 + ret <4 x i32> %9 +} + diff --git a/llvm/test/CodeGen/AArch64/stp-aligned.ll b/llvm/test/CodeGen/AArch64/stp-aligned.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stp-aligned.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=generic -aarch64-stp-policy=aligned | FileCheck %s + +define ptr @stp_aligned_int32_t(ptr %0, i32 %1) #0 { +; CHECK-LABEL: stp_aligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x0, x0, #0xffffffffffffffc0 +; CHECK-NEXT: stp w1, w1, [x0] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -64 + %5 = inttoptr i64 %4 to ptr + store i32 %1, ptr %5, align 64 + %6 = getelementptr inbounds i32, ptr %5, i64 1 + store i32 %1, ptr %6, align 4 + ret ptr %5 +} + +define dso_local ptr @stp_aligned_int64_t(ptr %0, i64 %1) #0 { +; CHECK-LABEL: stp_aligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x0, x0, #0xffffffffffffff80 +; CHECK-NEXT: stp x1, x1, [x0] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -128 + %5 = inttoptr i64 %4 to ptr + store i64 %1, ptr %5, align 128 + %6 = getelementptr inbounds i64, ptr %5, i64 1 + store i64 %1, ptr %6, align 8 + ret 
ptr %5 +} + +define ptr @stp_aligned_v4si(ptr %0, <4 x i32> %1) #0 { +; CHECK-LABEL: stp_aligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x0, x0, #0xffffffffffffff00 +; CHECK-NEXT: stp q0, q0, [x0] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -256 + %5 = inttoptr i64 %4 to ptr + store <4 x i32> %1, ptr %5, align 256 + %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1 + store <4 x i32> %1, ptr %6, align 16 + ret ptr %5 +} + +define ptr @stp_unaligned_int32_t(ptr %0, i32 %1) #0 { +; CHECK-LABEL: stp_unaligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffffc0 +; CHECK-NEXT: orr x0, x8, #0x4 +; CHECK-NEXT: str w1, [x8, #4] +; CHECK-NEXT: str w1, [x8, #8] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -64 + %5 = inttoptr i64 %4 to ptr + %6 = getelementptr inbounds i32, ptr %5, i64 1 + store i32 %1, ptr %6, align 4 + %7 = getelementptr inbounds i32, ptr %5, i64 2 + store i32 %1, ptr %7, align 8 + ret ptr %6 +} + +define ptr @stp_unaligned_int64_t(ptr %0, i64 %1) #0 { +; CHECK-LABEL: stp_unaligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff80 +; CHECK-NEXT: orr x0, x8, #0x8 +; CHECK-NEXT: str x1, [x8, #8] +; CHECK-NEXT: str x1, [x8, #16] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -128 + %5 = inttoptr i64 %4 to ptr + %6 = getelementptr inbounds i64, ptr %5, i64 1 + store i64 %1, ptr %6, align 8 + %7 = getelementptr inbounds i64, ptr %5, i64 2 + store i64 %1, ptr %7, align 16 + ret ptr %6 +} + +define ptr @stp_unaligned_v4si(ptr %0, <4 x i32> %1) #0 { +; CHECK-LABEL: stp_unaligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff00 +; CHECK-NEXT: orr x0, x8, #0x10 +; CHECK-NEXT: str q0, [x8, #16] +; CHECK-NEXT: str q0, [x8, #32] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -256 + %5 = inttoptr i64 %4 to ptr + %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1 + store <4 x i32> %1, ptr %6, align 16 + 
%7 = getelementptr inbounds <4 x i32>, ptr %5, i64 2 + store <4 x i32> %1, ptr %7, align 32 + ret ptr %6 +} + diff --git a/llvm/test/CodeGen/AArch64/stp-always.ll b/llvm/test/CodeGen/AArch64/stp-always.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stp-always.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=generic -aarch64-stp-policy=always | FileCheck %s + +define ptr @stp_aligned_int32_t(ptr %0, i32 %1) #0 { +; CHECK-LABEL: stp_aligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x0, x0, #0xffffffffffffffc0 +; CHECK-NEXT: stp w1, w1, [x0] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -64 + %5 = inttoptr i64 %4 to ptr + store i32 %1, ptr %5, align 64 + %6 = getelementptr inbounds i32, ptr %5, i64 1 + store i32 %1, ptr %6, align 4 + ret ptr %5 +} + +define ptr @stp_aligned_int64_t(ptr %0, i64 %1) #0 { +; CHECK-LABEL: stp_aligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x0, x0, #0xffffffffffffff80 +; CHECK-NEXT: stp x1, x1, [x0] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -128 + %5 = inttoptr i64 %4 to ptr + store i64 %1, ptr %5, align 128 + %6 = getelementptr inbounds i64, ptr %5, i64 1 + store i64 %1, ptr %6, align 8 + ret ptr %5 +} + +define ptr @stp_aligned_v4si(ptr %0, <4 x i32> %1) #0 { +; CHECK-LABEL: stp_aligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x0, x0, #0xffffffffffffff00 +; CHECK-NEXT: stp q0, q0, [x0] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -256 + %5 = inttoptr i64 %4 to ptr + store <4 x i32> %1, ptr %5, align 256 + %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1 + store <4 x i32> %1, ptr %6, align 16 + ret ptr %5 +} + +define ptr @stp_unaligned_int32_t(ptr %0, i32 %1) #0 { +; CHECK-LABEL: stp_unaligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffffc0 +; CHECK-NEXT: orr x0, x8, #0x4 +; 
CHECK-NEXT: stp w1, w1, [x8, #4] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -64 + %5 = inttoptr i64 %4 to ptr + %6 = getelementptr inbounds i32, ptr %5, i64 1 + store i32 %1, ptr %6, align 4 + %7 = getelementptr inbounds i32, ptr %5, i64 2 + store i32 %1, ptr %7, align 8 + ret ptr %6 +} + +define ptr @stp_unaligned_int64_t(ptr %0, i64 %1) #0 { +; CHECK-LABEL: stp_unaligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff80 +; CHECK-NEXT: orr x0, x8, #0x8 +; CHECK-NEXT: stp x1, x1, [x8, #8] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -128 + %5 = inttoptr i64 %4 to ptr + %6 = getelementptr inbounds i64, ptr %5, i64 1 + store i64 %1, ptr %6, align 8 + %7 = getelementptr inbounds i64, ptr %5, i64 2 + store i64 %1, ptr %7, align 16 + ret ptr %6 +} + +define ptr @stp_unaligned_v4si(ptr %0, <4 x i32> %1) #0 { +; CHECK-LABEL: stp_unaligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff00 +; CHECK-NEXT: orr x0, x8, #0x10 +; CHECK-NEXT: stp q0, q0, [x8, #16] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -256 + %5 = inttoptr i64 %4 to ptr + %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1 + store <4 x i32> %1, ptr %6, align 16 + %7 = getelementptr inbounds <4 x i32>, ptr %5, i64 2 + store <4 x i32> %1, ptr %7, align 32 + ret ptr %6 +} + diff --git a/llvm/test/CodeGen/AArch64/stp-never.ll b/llvm/test/CodeGen/AArch64/stp-never.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stp-never.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=generic -aarch64-stp-policy=never | FileCheck %s + +define ptr @stp_aligned_int32_t(ptr %0, i32 %1) #0 { +; CHECK-LABEL: stp_aligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x0, x0, #0xffffffffffffffc0 +; CHECK-NEXT: str w1, [x0] +; CHECK-NEXT: str w1, [x0, #4] +; CHECK-NEXT: ret + %3 
= ptrtoint ptr %0 to i64 + %4 = and i64 %3, -64 + %5 = inttoptr i64 %4 to ptr + store i32 %1, ptr %5, align 64 + %6 = getelementptr inbounds i32, ptr %5, i64 1 + store i32 %1, ptr %6, align 4 + ret ptr %5 +} + +define ptr @stp_aligned_int64_t(ptr %0, i64 %1) #0 { +; CHECK-LABEL: stp_aligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x0, x0, #0xffffffffffffff80 +; CHECK-NEXT: str x1, [x0] +; CHECK-NEXT: str x1, [x0, #8] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -128 + %5 = inttoptr i64 %4 to ptr + store i64 %1, ptr %5, align 128 + %6 = getelementptr inbounds i64, ptr %5, i64 1 + store i64 %1, ptr %6, align 8 + ret ptr %5 +} + +define ptr @stp_aligned_v4si(ptr %0, <4 x i32> %1) #0 { +; CHECK-LABEL: stp_aligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x0, x0, #0xffffffffffffff00 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: str q0, [x0, #16] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -256 + %5 = inttoptr i64 %4 to ptr + store <4 x i32> %1, ptr %5, align 256 + %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1 + store <4 x i32> %1, ptr %6, align 16 + ret ptr %5 +} + +define ptr @stp_unaligned_int32_t(ptr %0, i32 %1) #0 { +; CHECK-LABEL: stp_unaligned_int32_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffffc0 +; CHECK-NEXT: orr x0, x8, #0x4 +; CHECK-NEXT: str w1, [x8, #4] +; CHECK-NEXT: str w1, [x8, #8] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -64 + %5 = inttoptr i64 %4 to ptr + %6 = getelementptr inbounds i32, ptr %5, i64 1 + store i32 %1, ptr %6, align 4 + %7 = getelementptr inbounds i32, ptr %5, i64 2 + store i32 %1, ptr %7, align 8 + ret ptr %6 +} + +define ptr @stp_unaligned_int64_t(ptr %0, i64 %1) #0 { +; CHECK-LABEL: stp_unaligned_int64_t: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff80 +; CHECK-NEXT: orr x0, x8, #0x8 +; CHECK-NEXT: str x1, [x8, #8] +; CHECK-NEXT: str x1, [x8, #16] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and 
i64 %3, -128 + %5 = inttoptr i64 %4 to ptr + %6 = getelementptr inbounds i64, ptr %5, i64 1 + store i64 %1, ptr %6, align 8 + %7 = getelementptr inbounds i64, ptr %5, i64 2 + store i64 %1, ptr %7, align 16 + ret ptr %6 +} + +define ptr @stp_unaligned_v4si(ptr %0, <4 x i32> %1) #0 { +; CHECK-LABEL: stp_unaligned_v4si: +; CHECK: // %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffffffffff00 +; CHECK-NEXT: orr x0, x8, #0x10 +; CHECK-NEXT: str q0, [x8, #16] +; CHECK-NEXT: str q0, [x8, #32] +; CHECK-NEXT: ret + %3 = ptrtoint ptr %0 to i64 + %4 = and i64 %3, -256 + %5 = inttoptr i64 %4 to ptr + %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1 + store <4 x i32> %1, ptr %6, align 16 + %7 = getelementptr inbounds <4 x i32>, ptr %5, i64 2 + store <4 x i32> %1, ptr %7, align 32 + ret ptr %6 +} +