diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp
--- a/clang-tools-extra/clang-doc/Serialize.cpp
+++ b/clang-tools-extra/clang-doc/Serialize.cpp
@@ -168,7 +168,7 @@
 }
 
 bool ClangDocCommentVisitor::isWhitespaceOnly(llvm::StringRef S) const {
-  return std::all_of(S.begin(), S.end(), isspace);
+  return llvm::all_of(S, isspace);
 }
 
 std::string ClangDocCommentVisitor::getCommandName(unsigned CommandID) const {
diff --git a/clang-tools-extra/clang-move/Move.cpp b/clang-tools-extra/clang-move/Move.cpp
--- a/clang-tools-extra/clang-move/Move.cpp
+++ b/clang-tools-extra/clang-move/Move.cpp
@@ -920,8 +920,7 @@
       return false;
     }
   };
-  if (std::none_of(UnremovedDeclsInOldHeader.begin(),
-                   UnremovedDeclsInOldHeader.end(), IsSupportedKind) &&
+  if (llvm::none_of(UnremovedDeclsInOldHeader, IsSupportedKind) &&
       !Context->Spec.OldHeader.empty()) {
     auto &SM = RemovedDecls[0]->getASTContext().getSourceManager();
     moveAll(SM, Context->Spec.OldHeader, Context->Spec.NewHeader);
diff --git a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp
--- a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp
+++ b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp
@@ -38,14 +38,10 @@
 // set of reserved characters. See:
 // https://www.unicode.org/reports/tr35/tr35.html#Invalid_Patterns
 bool isValidDatePattern(StringRef Pattern) {
-  for (auto &PatternChar : Pattern) {
-    if (isalpha(PatternChar)) {
-      if (!llvm::is_contained(ValidDatePatternChars, PatternChar)) {
-        return false;
-      }
-    }
-  }
-  return true;
+  return llvm::all_of(Pattern, [](const auto &PatternChar) {
+    return !isalpha(PatternChar) ||
+           llvm::is_contained(ValidDatePatternChars, PatternChar);
+  });
 }
 
 // Checks if the string pattern used as a date format specifier contains
diff --git a/clang-tools-extra/clangd/URI.cpp b/clang-tools-extra/clangd/URI.cpp
--- a/clang-tools-extra/clangd/URI.cpp
+++ b/clang-tools-extra/clangd/URI.cpp
@@ -142,7 +142,7 @@
     return false;
   if (!llvm::isAlpha(Scheme[0]))
     return false;
-  return std::all_of(Scheme.begin() + 1, Scheme.end(), [](char C) {
+  return llvm::all_of(llvm::drop_begin(Scheme), [](char C) {
     return llvm::isAlnum(C) || C == '+' || C == '.' || C == '-';
   });
 }
diff --git a/clang-tools-extra/clangd/index/CanonicalIncludes.cpp b/clang-tools-extra/clangd/index/CanonicalIncludes.cpp
--- a/clang-tools-extra/clangd/index/CanonicalIncludes.cpp
+++ b/clang-tools-extra/clangd/index/CanonicalIncludes.cpp
@@ -777,12 +777,11 @@
                          llvm::sys::path::end(Path)) <= MaxSuffixComponents;
   }));
   // ... and precise.
-  assert(llvm::find_if(SystemHeaderMap->keys(), [](llvm::StringRef Path) {
-           return std::distance(llvm::sys::path::begin(
-                                    Path, llvm::sys::path::Style::posix),
-                                llvm::sys::path::end(Path)) ==
-                  MaxSuffixComponents;
-         }) != SystemHeaderMap->keys().end());
+  assert(llvm::any_of(SystemHeaderMap->keys(), [](llvm::StringRef Path) {
+    return std::distance(
+               llvm::sys::path::begin(Path, llvm::sys::path::Style::posix),
+               llvm::sys::path::end(Path)) == MaxSuffixComponents;
+  }));
 
   // FIXME: Suffix mapping contains invalid entries for C, so only enable it for
   // CPP.
diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp
--- a/clang-tools-extra/clangd/refactor/Rename.cpp
+++ b/clang-tools-extra/clangd/refactor/Rename.cpp
@@ -757,12 +757,12 @@
     return StartOffset.takeError();
   if (!EndOffset)
     return EndOffset.takeError();
-  if (llvm::find_if(
+  if (llvm::none_of(
           *MainFileRenameEdit,
           [&StartOffset, &EndOffset](const clang::tooling::Replacement &R) {
             return R.getOffset() == *StartOffset &&
                    R.getLength() == *EndOffset - *StartOffset;
-          }) == MainFileRenameEdit->end()) {
+          })) {
     return makeError(ReasonToReject::NoSymbolFound);
   }
   RenameResult Result;
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -71,6 +71,9 @@
 - Fix `#57008 `_ - Builtin C++ language extension type traits
   instantiated by a template with unexpected number of arguments cause an
   assertion fault.
+- Fix multi-level pack expansion of undeclared function parameters.
+  This fixes `Issue 56094 `_.
+
 Improvements to Clang's diagnostics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -3061,10 +3061,6 @@
                             PREARGS_START + getNumPreArgs() + getNumArgs());
   }
 
-  /// getNumCommas - Return the number of commas that must have been present in
-  /// this function call.
-  unsigned getNumCommas() const { return getNumArgs() ? getNumArgs() - 1 : 0; }
-
   /// Get FPOptionsOverride from trailing storage.
   FPOptionsOverride getStoredFPFeatures() const {
     assert(hasStoredFPFeatures());
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
@@ -348,10 +348,12 @@
   /// Returns the `DeclContext` of the block being analysed, if any. Otherwise,
   /// returns null.
-  const DeclContext *getDeclCtx() { return DeclCtx; }
+  const DeclContext *getDeclCtx() { return CallStack.back(); }
 
-  /// Sets the `DeclContext` of the block being analysed.
-  void setDeclCtx(const DeclContext *Ctx) { DeclCtx = Ctx; }
+  /// Returns whether this `Environment` can be extended to analyze the given
+  /// `Callee` (i.e. if `pushCall` can be used), with recursion disallowed and a
+  /// given `MaxDepth`.
+  bool canDescend(unsigned MaxDepth, const DeclContext *Callee) const;
 
   /// Returns the `ControlFlowContext` registered for `F`, if any. Otherwise,
   /// returns null.
@@ -390,7 +392,7 @@
   DataflowAnalysisContext *DACtx;
 
   // `DeclContext` of the block being analysed if provided.
-  const DeclContext *DeclCtx = nullptr;
+  std::vector<const DeclContext *> CallStack;
 
   // In a properly initialized `Environment`, `ReturnLoc` should only be null if
   // its `DeclContext` could not be cast to a `FunctionDecl`.
diff --git a/clang/include/clang/Analysis/FlowSensitive/Transfer.h b/clang/include/clang/Analysis/FlowSensitive/Transfer.h
--- a/clang/include/clang/Analysis/FlowSensitive/Transfer.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Transfer.h
@@ -21,7 +21,11 @@
 namespace clang {
 namespace dataflow {
 
-struct ContextSensitiveOptions {};
+struct ContextSensitiveOptions {
+  /// The maximum depth to analyze. A value of zero is equivalent to disabling
+  /// context-sensitive analysis entirely.
+  unsigned Depth = 2;
+};
 
 struct TransferOptions {
   /// Options for analyzing function bodies when present in the translation
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3486,8 +3486,8 @@
 bool FunctionDecl::hasOneParamOrDefaultArgs() const {
   return getNumParams() == 1 ||
          (getNumParams() > 1 &&
-          std::all_of(param_begin() + 1, param_end(),
-                      [](ParmVarDecl *P) { return P->hasDefaultArg(); }));
+          llvm::all_of(llvm::drop_begin(parameters()),
+                       [](ParmVarDecl *P) { return P->hasDefaultArg(); }));
 }
 
 /// The combination of the extern and inline keywords under MSVC forces
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -154,10 +154,10 @@
     : DACtx(&DACtx), FlowConditionToken(&DACtx.makeFlowConditionToken()) {}
 
 Environment::Environment(const Environment &Other)
-    : DACtx(Other.DACtx), DeclCtx(Other.DeclCtx), ReturnLoc(Other.ReturnLoc),
-      ThisPointeeLoc(Other.ThisPointeeLoc), DeclToLoc(Other.DeclToLoc),
-      ExprToLoc(Other.ExprToLoc), LocToVal(Other.LocToVal),
-      MemberLocToStruct(Other.MemberLocToStruct),
+    : DACtx(Other.DACtx), CallStack(Other.CallStack),
+      ReturnLoc(Other.ReturnLoc), ThisPointeeLoc(Other.ThisPointeeLoc),
+      DeclToLoc(Other.DeclToLoc), ExprToLoc(Other.ExprToLoc),
+      LocToVal(Other.LocToVal), MemberLocToStruct(Other.MemberLocToStruct),
       FlowConditionToken(&DACtx->forkFlowCondition(*Other.FlowConditionToken)) {
 }
 
@@ -168,11 +168,11 @@
 }
 
 Environment::Environment(DataflowAnalysisContext &DACtx,
-                         const DeclContext &DeclCtxArg)
+                         const DeclContext &DeclCtx)
     : Environment(DACtx) {
-  setDeclCtx(&DeclCtxArg);
+  CallStack.push_back(&DeclCtx);
 
-  if (const auto *FuncDecl = dyn_cast<FunctionDecl>(DeclCtx)) {
+  if (const auto *FuncDecl = dyn_cast<FunctionDecl>(&DeclCtx)) {
     assert(FuncDecl->getBody() != nullptr);
     initGlobalVars(*FuncDecl->getBody(), *this);
     for (const auto *ParamDecl : FuncDecl->parameters()) {
@@ -187,7 +187,7 @@
     ReturnLoc = &createStorageLocation(ReturnType);
   }
 
-  if (const auto *MethodDecl = dyn_cast<CXXMethodDecl>(DeclCtx)) {
+  if (const auto *MethodDecl = dyn_cast<CXXMethodDecl>(&DeclCtx)) {
     auto *Parent = MethodDecl->getParent();
     assert(Parent != nullptr);
     if (Parent->isLambda())
@@ -205,6 +205,13 @@
   }
 }
 
+bool Environment::canDescend(unsigned MaxDepth,
+                             const DeclContext *Callee) const {
+  return CallStack.size() <= MaxDepth &&
+         std::find(CallStack.begin(), CallStack.end(), Callee) ==
+             CallStack.end();
+}
+
 Environment Environment::pushCall(const CallExpr *Call) const {
   Environment Env(*this);
 
@@ -239,7 +246,7 @@
 void Environment::pushCallInternal(const FunctionDecl *FuncDecl,
                                    ArrayRef<const Expr *> Args) {
-  setDeclCtx(FuncDecl);
+  CallStack.push_back(FuncDecl);
 
   // FIXME: In order to allow the callee to reference globals, we probably need
   // to call `initGlobalVars` here in some way.
@@ -326,13 +333,13 @@
   assert(DACtx == Other.DACtx);
   assert(ReturnLoc == Other.ReturnLoc);
   assert(ThisPointeeLoc == Other.ThisPointeeLoc);
-  assert(DeclCtx == Other.DeclCtx);
+  assert(CallStack == Other.CallStack);
 
   auto Effect = LatticeJoinEffect::Unchanged;
 
   Environment JoinedEnv(*DACtx);
 
-  JoinedEnv.setDeclCtx(DeclCtx);
+  JoinedEnv.CallStack = CallStack;
   JoinedEnv.ReturnLoc = ReturnLoc;
   JoinedEnv.ThisPointeeLoc = ThisPointeeLoc;
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -661,7 +661,8 @@
   // `F` of `S`. The type `E` must be either `CallExpr` or `CXXConstructExpr`.
   template <typename E>
   void transferInlineCall(const E *S, const FunctionDecl *F) {
-    if (!Options.ContextSensitiveOpts)
+    if (!(Options.ContextSensitiveOpts &&
+          Env.canDescend(Options.ContextSensitiveOpts->Depth, F)))
       return;
 
     const ControlFlowContext *CFCtx = Env.getControlFlowContext(F);
@@ -689,7 +690,7 @@
     assert(CFCtx->getDecl() != nullptr &&
            "ControlFlowContexts in the environment should always carry a decl");
     auto Analysis = NoopAnalysis(CFCtx->getDecl()->getASTContext(),
-                                 DataflowAnalysisOptions());
+                                 DataflowAnalysisOptions{Options});
 
     auto BlockToOutputState =
         dataflow::runDataflowAnalysis(*CFCtx, Analysis, CalleeEnv);
diff --git a/clang/lib/Analysis/ReachableCode.cpp b/clang/lib/Analysis/ReachableCode.cpp
--- a/clang/lib/Analysis/ReachableCode.cpp
+++ b/clang/lib/Analysis/ReachableCode.cpp
@@ -299,6 +299,12 @@
     if (isa<SwitchStmt>(Term)) {
       return isConfigurationValue(Term, PP);
     }
+    // Do not treat constexpr if statement successors as unreachable in warnings
+    // since the point of these statements is to determine branches at compile
+    // time.
+    if (const auto *IS = dyn_cast<IfStmt>(Term);
+        IS != nullptr && IS->isConstexpr())
+      return true;
   }
 
   const Stmt *Cond = B->getTerminatorCondition(/* stripParens */ false);
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -332,8 +332,7 @@
     return;
 
   // No special characters are allowed in CaretLine.
-  assert(CaretLine.end() ==
-         llvm::find_if(CaretLine, [](char c) { return c < ' ' || '~' < c; }));
+  assert(llvm::none_of(CaretLine, [](char c) { return c < ' ' || '~' < c; }));
 
   // Find the slice that we need to display the full caret line
   // correctly.
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -3788,9 +3788,8 @@
             // Variable is used if it has been marked as an array, array
             // section, array shaping or the variable iself.
             return StackComponents.size() == 1 ||
-                   std::all_of(
-                       std::next(StackComponents.rbegin()),
-                       StackComponents.rend(),
+                   llvm::all_of(
+                       llvm::drop_begin(llvm::reverse(StackComponents)),
                        [](const OMPClauseMappableExprCommon::
                               MappableComponent &MC) {
                          return MC.getAssociatedDeclaration() ==
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -5792,6 +5792,7 @@
             = dyn_cast<PackExpansionType>(OldType)) {
       // We have a function parameter pack that may need to be expanded.
       QualType Pattern = Expansion->getPattern();
+      NumExpansions = Expansion->getNumExpansions();
       SmallVector<UnexpandedParameterPack, 2> Unexpanded;
       getSema().collectUnexpandedParameterPacks(Pattern, Unexpanded);
diff --git a/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp b/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
--- a/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
+++ b/clang/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
@@ -469,3 +469,25 @@
     bar(b);
   }
 }
+
+namespace pr56094 {
+template <typename... T> struct D {
+  template <typename... U> using B = int(int (*...p)(T, U));
+  // expected-error@-1 {{pack expansion contains parameter pack 'U' that has a different length (1 vs. 2) from outer parameter packs}}
+  template <typename... U> D(B<U...> *);
+  // expected-note@-1 {{in instantiation of template type alias 'B' requested here}}
+};
+using t1 = D<float, int>::B<int>;
+// expected-note@-1 {{in instantiation of template class 'pr56094::D<float, int>' requested here}}
+
+template <typename...> struct F {};
+template <typename...> struct G {};
+template <typename... T> struct E {
+  template <typename... U> using B = G<F<T, U>...>;
+  // expected-error@-1 {{pack expansion contains parameter pack 'U' that has a different length (1 vs. 2) from outer parameter packs}}
+  template <typename... U> E(B<U...> *);
+  // expected-note@-1 {{in instantiation of template type alias 'B' requested here}}
+};
+using t2 = E<float, int>::B<int>;
+// expected-note@-1 {{in instantiation of template class 'pr56094::E<float, int>' requested here}}
+} // namespace pr56094
diff --git a/clang/test/CodeGenCXX/pragma-init_seg.cpp b/clang/test/CodeGenCXX/pragma-init_seg.cpp
--- a/clang/test/CodeGenCXX/pragma-init_seg.cpp
+++ b/clang/test/CodeGenCXX/pragma-init_seg.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -no-opaque-pointers %s -triple=i686-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=i686-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
 
 int f();
 
@@ -10,12 +10,12 @@
 #pragma init_seg(compiler)
 int x = f();
 // CHECK: @"?x@simple_init@@3HA" = dso_local global i32 0, align 4
-// CHECK: @__cxx_init_fn_ptr = private constant void ()* @"??__Ex@simple_init@@YAXXZ", section ".CRT$XCC"
+// CHECK: @__cxx_init_fn_ptr = private constant ptr @"??__Ex@simple_init@@YAXXZ", section ".CRT$XCC"
 
 #pragma init_seg(lib)
 int y = f();
 // CHECK: @"?y@simple_init@@3HA" = dso_local global i32 0, align 4
-// CHECK: @__cxx_init_fn_ptr.1 = private constant void ()* @"??__Ey@simple_init@@YAXXZ", section ".CRT$XCL"
+// CHECK: @__cxx_init_fn_ptr.1 = private constant ptr @"??__Ey@simple_init@@YAXXZ", section ".CRT$XCL"
 
 #pragma init_seg(user)
 int z = f();
@@ -29,14 +29,14 @@
 namespace {
 int x = f();
 // CHECK: @"?x@?A0x{{[^@]*}}@internal_init@@3HA" = internal global i32 0, align 4
-// CHECK: @__cxx_init_fn_ptr.2 = private constant void ()* @"??__Ex@?A0x{{[^@]*}}@internal_init@@YAXXZ", section ".asdf"
+// CHECK: @__cxx_init_fn_ptr.2 = private constant ptr @"??__Ex@?A0x{{[^@]*}}@internal_init@@YAXXZ", section ".asdf"
 }
 }
 
 namespace selectany_init {
 int __declspec(selectany) x = f();
 // CHECK: @"?x@selectany_init@@3HA" = weak_odr dso_local global i32 0, comdat, align 4
-// CHECK: @__cxx_init_fn_ptr.3 = private constant void ()* @"??__Ex@selectany_init@@YAXXZ", section ".asdf", comdat($"?x@selectany_init@@3HA")
+// CHECK: @__cxx_init_fn_ptr.3 = private constant ptr @"??__Ex@selectany_init@@YAXXZ", section ".asdf", comdat($"?x@selectany_init@@3HA")
 }
 
 namespace explicit_template_instantiation {
@@ -44,7 +44,7 @@
 template <typename T> const int A<T>::x = f();
 template struct A<int>;
 // CHECK: @"?x@?$A@H@explicit_template_instantiation@@2HB" = weak_odr dso_local global i32 0, comdat, align 4
-// CHECK: @__cxx_init_fn_ptr.4 = private constant void ()* @"??__E?x@?$A@H@explicit_template_instantiation@@2HB@@YAXXZ", section ".asdf", comdat($"?x@?$A@H@explicit_template_instantiation@@2HB")
@"??__E?x@?$A@H@explicit_template_instantiation@@2HB@@YAXXZ", section ".asdf", comdat($"?x@?$A@H@explicit_template_instantiation@@2HB") +// CHECK: @__cxx_init_fn_ptr.4 = private constant ptr @"??__E?x@?$A@H@explicit_template_instantiation@@2HB@@YAXXZ", section ".asdf", comdat($"?x@?$A@H@explicit_template_instantiation@@2HB") } namespace implicit_template_instantiation { @@ -52,21 +52,21 @@ template const int A::x = f(); int g() { return A::x; } // CHECK: @"?x@?$A@H@implicit_template_instantiation@@2HB" = linkonce_odr dso_local global i32 0, comdat, align 4 -// CHECK: @__cxx_init_fn_ptr.5 = private constant void ()* @"??__E?x@?$A@H@implicit_template_instantiation@@2HB@@YAXXZ", section ".asdf", comdat($"?x@?$A@H@implicit_template_instantiation@@2HB") +// CHECK: @__cxx_init_fn_ptr.5 = private constant ptr @"??__E?x@?$A@H@implicit_template_instantiation@@2HB@@YAXXZ", section ".asdf", comdat($"?x@?$A@H@implicit_template_instantiation@@2HB") } // ... and here's where we emitted user level ctors. -// CHECK: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] -// CHECK: [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_pragma_init_seg.cpp, i8* null }] +// CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] +// CHECK: [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_pragma_init_seg.cpp, ptr null }] // We have to mark everything used so we can survive globalopt, even through // LTO. There's no way LLVM could really understand if data in the .asdf // section is really used or dead. // -// CHECK: @llvm.used = appending global [6 x i8*] -// CHECK: [i8* bitcast (void ()** @__cxx_init_fn_ptr to i8*), -// CHECK: i8* bitcast (void ()** @__cxx_init_fn_ptr.1 to i8*), -// CHECK: i8* bitcast (void ()** @__cxx_init_fn_ptr.2 to i8*), -// CHECK: i8* bitcast (void ()** @__cxx_init_fn_ptr.3 to i8*), -// CHECK: i8* bitcast (void ()** @__cxx_init_fn_ptr.4 to i8*), -// CHECK: i8* bitcast (void ()** @__cxx_init_fn_ptr.5 to i8*)], section "llvm.metadata" +// CHECK: @llvm.used = appending global [6 x ptr] +// CHECK: [ptr @__cxx_init_fn_ptr, +// CHECK: ptr @__cxx_init_fn_ptr.1, +// CHECK: ptr @__cxx_init_fn_ptr.2, +// CHECK: ptr @__cxx_init_fn_ptr.3, +// CHECK: ptr @__cxx_init_fn_ptr.4, +// CHECK: ptr @__cxx_init_fn_ptr.5], section "llvm.metadata" diff --git a/clang/test/Driver/avr-ld.c b/clang/test/Driver/avr-ld.c --- a/clang/test/Driver/avr-ld.c +++ b/clang/test/Driver/avr-ld.c @@ -1,44 +1,44 @@ -// RUN: %clang -### --target=avr -mmcu=at90s2313 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKA %s +// RUN: %clang -### --target=avr -mmcu=at90s2313 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKA %s // LINKA: {{".*ld.*"}} {{.*}} {{"-L.*tiny-stack"}} {{.*}} "-Tdata=0x800060" "--start-group" {{.*}} "-lat90s2313" {{.*}} "--end-group" "-mavr2" -// RUN: %clang -### --target=avr -mmcu=at90s8515 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKB %s +// RUN: %clang -### --target=avr -mmcu=at90s8515 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKB %s // LINKB: {{".*ld.*"}} {{.*}} "-Tdata=0x800060" "--start-group" {{.*}} "-lat90s8515" {{.*}} "--end-group" "-mavr2" -// RUN: %clang -### --target=avr -mmcu=attiny13 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKC %s +// RUN: %clang -### --target=avr -mmcu=attiny13 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKC %s // LINKC: {{".*ld.*"}} {{.*}} 
{{"-L.*avr25/tiny-stack"}} {{.*}} "-Tdata=0x800060" "--start-group" {{.*}} "-lattiny13" {{.*}} "--end-group" "-mavr25" -// RUN: %clang -### --target=avr -mmcu=attiny44 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKD %s +// RUN: %clang -### --target=avr -mmcu=attiny44 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKD %s // LINKD: {{".*ld.*"}} {{.*}} {{"-L.*avr25"}} {{.*}} "-Tdata=0x800060" "--start-group" {{.*}} "-lattiny44" {{.*}} "--end-group" "-mavr25" -// RUN: %clang -### --target=avr -mmcu=atmega103 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKE %s +// RUN: %clang -### --target=avr -mmcu=atmega103 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKE %s // LINKE: {{".*ld.*"}} {{.*}} {{"-L.*avr31"}} {{.*}} "-Tdata=0x800060" "--start-group" {{.*}} "-latmega103" {{.*}} "--end-group" "-mavr31" -// RUN: %clang -### --target=avr -mmcu=atmega8u2 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKF %s +// RUN: %clang -### --target=avr -mmcu=atmega8u2 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKF %s // LINKF: {{".*ld.*"}} {{.*}} {{"-L.*avr35"}} {{.*}} "-Tdata=0x800100" "--start-group" {{.*}} "-latmega8u2" {{.*}} "--end-group" "-mavr35" -// RUN: %clang -### --target=avr -mmcu=atmega48pa --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKG %s +// RUN: %clang -### --target=avr -mmcu=atmega48pa --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKG %s // LINKG: {{".*ld.*"}} {{.*}} {{"-L.*avr4"}} {{.*}} "-Tdata=0x800100" "--start-group" {{.*}} "-latmega48pa" {{.*}} "--end-group" "-mavr4" -// RUN: %clang -### --target=avr -mmcu=atmega328 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKH %s +// RUN: %clang -### --target=avr -mmcu=atmega328 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKH %s // LINKH: {{".*ld.*"}} {{.*}} {{"-L.*avr5"}} {{.*}} "-Tdata=0x800100" "--start-group" {{.*}} "-latmega328" {{.*}} "--end-group" "-mavr5" -// RUN: %clang -### --target=avr -mmcu=atmega1281 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKI %s +// RUN: %clang -### --target=avr -mmcu=atmega1281 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKI %s // LINKI: {{".*ld.*"}} {{.*}} {{"-L.*avr51"}} {{.*}} "-Tdata=0x800200" "--start-group" {{.*}} "-latmega1281" {{.*}} "--end-group" "-mavr51" -// RUN: %clang -### --target=avr -mmcu=atmega2560 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKJ %s +// RUN: %clang -### --target=avr -mmcu=atmega2560 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKJ %s // LINKJ: {{".*ld.*"}} {{.*}} {{"-L.*avr6"}} {{.*}} "-Tdata=0x800200" "--start-group" {{.*}} "-latmega2560" {{.*}} "--end-group" "-mavr6" -// RUN: %clang -### --target=avr -mmcu=attiny10 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKK %s +// RUN: %clang -### --target=avr -mmcu=attiny10 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKK %s // LINKK: {{".*ld.*"}} {{.*}} {{"-L.*avrtiny"}} {{.*}} "-Tdata=0x800040" "--start-group" {{.*}} "-lattiny10" {{.*}} "--end-group" "-mavrtiny" -// RUN: %clang -### --target=avr -mmcu=atxmega16a4 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKL %s +// RUN: %clang -### 
 --target=avr -mmcu=atxmega16a4 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKL %s
 // LINKL: {{".*ld.*"}} {{.*}} {{"-L.*avrxmega2"}} {{.*}} "-Tdata=0x802000" "--start-group" {{.*}} "-latxmega16a4" {{.*}} "--end-group" "-mavrxmega2"
 
-// RUN: %clang -### --target=avr -mmcu=atxmega64b3 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKM %s
+// RUN: %clang -### --target=avr -mmcu=atxmega64b3 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKM %s
 // LINKM: {{".*ld.*"}} {{.*}} {{"-L.*avrxmega4"}} {{.*}} "-Tdata=0x802000" "--start-group" {{.*}} "-latxmega64b3" {{.*}} "--end-group" "-mavrxmega4"
 
-// RUN: %clang -### --target=avr -mmcu=atxmega128a3u --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKN %s
+// RUN: %clang -### --target=avr -mmcu=atxmega128a3u --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKN %s
 // LINKN: {{".*ld.*"}} {{.*}} {{"-L.*avrxmega6"}} {{.*}} "-Tdata=0x802000" "--start-group" {{.*}} "-latxmega128a3u" {{.*}} "--end-group" "-mavrxmega6"
 
-// RUN: %clang -### --target=avr -mmcu=atxmega128a1 --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKO %s
+// RUN: %clang -### --target=avr -mmcu=atxmega128a1 --rtlib=libgcc --sysroot %S/Inputs/basic_avr_tree %s 2>&1 | FileCheck -check-prefix LINKO %s
 // LINKO: {{".*ld.*"}} {{.*}} {{"-L.*avrxmega7"}} {{.*}} "-Tdata=0x802000" "--start-group" {{.*}} "-latxmega128a1" {{.*}} "--end-group" "-mavrxmega7"
diff --git a/clang/test/Driver/avr-toolchain.c b/clang/test/Driver/avr-toolchain.c
--- a/clang/test/Driver/avr-toolchain.c
+++ b/clang/test/Driver/avr-toolchain.c
@@ -73,7 +73,6 @@
 // LDS1: "-T" "avr.lds"
 // LDS1-NOT: "-mavr5"
 
-// RUN: %clang %s -### --target=avr -mmcu=atmega328 --sysroot=%S/Inputs/basic_avr_tree/ -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir 2>&1 | FileCheck --check-prefix=LIBGCC %s
 // RUN: %clang %s -### --target=avr -mmcu=atmega328 --sysroot=%S/Inputs/basic_avr_tree/ -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir --rtlib=libgcc 2>&1 | FileCheck --check-prefix=LIBGCC %s
 // LIBGCC: "-lgcc"
 // LIBGCC-NOT: libclang_rt
diff --git a/clang/test/SemaCXX/unreachable-code.cpp b/clang/test/SemaCXX/unreachable-code.cpp
--- a/clang/test/SemaCXX/unreachable-code.cpp
+++ b/clang/test/SemaCXX/unreachable-code.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -Wunreachable-code-aggressive -fblocks -verify %s
+// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fexceptions -fsyntax-only -Wunreachable-code-aggressive -fblocks -verify %s
 
 int j;
 int bar();
@@ -99,3 +99,34 @@
   }
 }
+
+namespace gh57123 {
+  bool foo() {
+    if constexpr (true) {
+      if (true)
+        return true;
+      else
+        return false; // expected-warning {{will never be executed}}
+    }
+    else
+      return false; // no-warning
+  }
+
+  bool bar() {
+    if (true)
+      return true;
+    else
+      return false; // expected-warning {{will never be executed}}
+  }
+
+  bool baz() {
+    if constexpr (true)
+      return true;
+    else {
+      if (true)
+        return true;
+      else
+        return false; // expected-warning {{will never be executed}}
+    }
+  }
+}
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -853,8 +853,8 @@
 // `__start_` and `__stop_` symbols.
 bool isValidCIdentifier(StringRef S) {
   return !S.empty() && (isAlpha(S[0]) || S[0] == '_') &&
-         std::all_of(S.begin() + 1, S.end(),
-                     [](char C) { return C == '_' || isAlnum(C); });
+         llvm::all_of(llvm::drop_begin(S),
+                      [](char C) { return C == '_' || isAlnum(C); });
 }
 
 Error linkBitcodeFiles(SmallVectorImpl<OffloadFile> &InputFiles,
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -3902,6 +3902,36 @@
               {TransferOptions{/*.ContextSensitiveOpts=*/llvm::None}});
 }
 
+TEST(TransferTest, ContextSensitiveDepthZero) {
+  std::string Code = R"(
+    bool GiveBool();
+    void SetBool(bool &Var) { Var = true; }
+
+    void target() {
+      bool Foo = GiveBool();
+      SetBool(Foo);
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                auto &FooVal =
+                    *cast<BoolValue>(Env.getValue(*FooDecl, SkipPast::None));
+                EXPECT_FALSE(Env.flowConditionImplies(FooVal));
+                EXPECT_FALSE(Env.flowConditionImplies(Env.makeNot(FooVal)));
+              },
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/0}}});
+}
+
 TEST(TransferTest, ContextSensitiveSetTrue) {
   std::string Code = R"(
     bool GiveBool();
     void SetBool(bool &Var) { Var = true; }
@@ -4000,7 +4030,7 @@
               {TransferOptions{ContextSensitiveOptions{}}});
 }
 
-TEST(TransferTest, ContextSensitiveSetTwoLayers) {
+TEST(TransferTest, ContextSensitiveSetTwoLayersDepthOne) {
   std::string Code = R"(
     bool GiveBool();
     void SetBool1(bool &Var) { Var = true; }
     void SetBool2(bool &Var) { SetBool1(Var); }
@@ -4028,7 +4058,146 @@
                 EXPECT_FALSE(Env.flowConditionImplies(FooVal));
                 EXPECT_FALSE(Env.flowConditionImplies(Env.makeNot(FooVal)));
               },
-              {TransferOptions{ContextSensitiveOptions{}}});
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/1}}});
+}
+
+TEST(TransferTest, ContextSensitiveSetTwoLayersDepthTwo) {
+  std::string Code = R"(
+    bool GiveBool();
+    void SetBool1(bool &Var) { Var = true; }
+    void SetBool2(bool &Var) { SetBool1(Var); }
+
+    void target() {
+      bool Foo = GiveBool();
+      SetBool2(Foo);
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                auto &FooVal =
+                    *cast<BoolValue>(Env.getValue(*FooDecl, SkipPast::None));
+                EXPECT_TRUE(Env.flowConditionImplies(FooVal));
+              },
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/2}}});
+}
+
+TEST(TransferTest, ContextSensitiveSetThreeLayersDepthTwo) {
+  std::string Code = R"(
+    bool GiveBool();
+    void SetBool1(bool &Var) { Var = true; }
+    void SetBool2(bool &Var) { SetBool1(Var); }
+    void SetBool3(bool &Var) { SetBool2(Var); }
+
+    void target() {
+      bool Foo = GiveBool();
+      SetBool3(Foo);
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                auto &FooVal =
+                    *cast<BoolValue>(Env.getValue(*FooDecl, SkipPast::None));
+                EXPECT_FALSE(Env.flowConditionImplies(FooVal));
+                EXPECT_FALSE(Env.flowConditionImplies(Env.makeNot(FooVal)));
+              },
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/2}}});
+}
+
+TEST(TransferTest, ContextSensitiveSetThreeLayersDepthThree) {
+  std::string Code = R"(
+    bool GiveBool();
+    void SetBool1(bool &Var) { Var = true; }
+    void SetBool2(bool &Var) { SetBool1(Var); }
+    void SetBool3(bool &Var) { SetBool2(Var); }
+
+    void target() {
+      bool Foo = GiveBool();
+      SetBool3(Foo);
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                auto &FooVal =
+                    *cast<BoolValue>(Env.getValue(*FooDecl, SkipPast::None));
+                EXPECT_TRUE(Env.flowConditionImplies(FooVal));
+              },
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/3}}});
+}
+
+TEST(TransferTest, ContextSensitiveMutualRecursion) {
+  std::string Code = R"(
+    bool Pong(bool X, bool Y);
+
+    bool Ping(bool X, bool Y) {
+      if (X) {
+        return Y;
+      } else {
+        return Pong(!X, Y);
+      }
+    }
+
+    bool Pong(bool X, bool Y) {
+      if (Y) {
+        return X;
+      } else {
+        return Ping(X, !Y);
+      }
+    }
+
+    void target() {
+      bool Foo = Ping(false, false);
+      // [[p]]
+    }
+  )";
+  runDataflow(Code,
+              [](llvm::ArrayRef<
+                     std::pair<std::string, DataflowAnalysisState<NoopLattice>>>
+                     Results,
+                 ASTContext &ASTCtx) {
+                ASSERT_THAT(Results, ElementsAre(Pair("p", _)));
+                // The analysis doesn't crash...
+                const Environment &Env = Results[0].second.Env;
+
+                const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+                ASSERT_THAT(FooDecl, NotNull());
+
+                auto &FooVal =
+                    *cast<BoolValue>(Env.getValue(*FooDecl, SkipPast::None));
+                // ... but it also can't prove anything here.
+                EXPECT_FALSE(Env.flowConditionImplies(FooVal));
+                EXPECT_FALSE(Env.flowConditionImplies(Env.makeNot(FooVal)));
+              },
+              {TransferOptions{ContextSensitiveOptions{/*.Depth=*/4}}});
 }
 
 TEST(TransferTest, ContextSensitiveSetMultipleLines) {
diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
--- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -404,17 +404,14 @@
     if (!groupInPedantic(Group))
       continue;
 
-    unsigned ParentsInPedantic = 0;
     const std::vector<Record*> &Parents = DiagGroupParents.getParents(Group);
-    for (unsigned j = 0, ej = Parents.size(); j != ej; ++j) {
-      if (groupInPedantic(Parents[j]))
-        ++ParentsInPedantic;
-    }
+    bool AllParentsInPedantic =
+        llvm::all_of(Parents, [&](Record *R) { return groupInPedantic(R); });
 
     // If all the parents are in -Wpedantic, this means that this diagnostic
     // group will be indirectly included by -Wpedantic already.  In that
     // case, do not add it directly to -Wpedantic.  If the group has no
     // parents, obviously it should go into -Wpedantic.
-    if (Parents.size() > 0 && ParentsInPedantic == Parents.size())
+    if (Parents.size() > 0 && AllParentsInPedantic)
       continue;
 
     if (RecordVec *V = GroupsInPedantic.dyn_cast<RecordVec*>())
diff --git a/compiler-rt/lib/msan/msan_report.cpp b/compiler-rt/lib/msan/msan_report.cpp
--- a/compiler-rt/lib/msan/msan_report.cpp
+++ b/compiler-rt/lib/msan/msan_report.cpp
@@ -37,14 +37,14 @@
 static void DescribeStackOrigin(const char *so, uptr pc) {
   Decorator d;
   Printf("%s", d.Origin());
-  if (so == nullptr) {
-    Printf("  %sUninitialized value was created in the stack frame%s\n",
-           d.Origin(), d.Default());
-  } else {
+  if (so) {
     Printf(
         "  %sUninitialized value was created by an allocation of '%s%s%s'"
         " in the stack frame%s\n",
         d.Origin(), d.Name(), so, d.Origin(), d.Default());
+  } else {
+    Printf("  %sUninitialized value was created in the stack frame%s\n",
+           d.Origin(), d.Default());
   }
 
   if (pc)
diff --git a/compiler-rt/lib/ubsan/CMakeLists.txt b/compiler-rt/lib/ubsan/CMakeLists.txt
--- a/compiler-rt/lib/ubsan/CMakeLists.txt
+++ b/compiler-rt/lib/ubsan/CMakeLists.txt
@@ -192,7 +192,8 @@
     add_compiler_rt_runtime(clang_rt.ubsan_standalone
       STATIC
       ARCHS ${UBSAN_SUPPORTED_ARCH}
-      SOURCES ubsan_init_standalone_preinit.cpp
+      SOURCES
+        ubsan_init_standalone_preinit.cpp
       ADDITIONAL_HEADERS ${UBSAN_HEADERS}
       OBJECT_LIBS RTSanitizerCommon
                   RTSanitizerCommonLibc
diff --git a/flang/cmake/modules/AddFlang.cmake b/flang/cmake/modules/AddFlang.cmake
--- a/flang/cmake/modules/AddFlang.cmake
+++ b/flang/cmake/modules/AddFlang.cmake
@@ -18,7 +18,7 @@
 
 macro(add_flang_library name)
   cmake_parse_arguments(ARG
-    "SHARED;STATIC"
+    "SHARED;STATIC;INSTALL_WITH_TOOLCHAIN"
    ""
    "ADDITIONAL_HEADERS"
    ${ARGN})
@@ -65,7 +65,8 @@
 
  if (TARGET ${name})
-    if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "libflang")
+    if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "libflang"
+        OR ARG_INSTALL_WITH_TOOLCHAIN)
      get_target_export_arg(${name} Flang export_to_flangtargets
        UMBRELLA flang-libraries)
      install(TARGETS ${name}
        COMPONENT ${name}
diff --git a/flang/docs/PolymorphicEntities.md b/flang/docs/PolymorphicEntities.md
new file mode 100644
--- /dev/null
+++ b/flang/docs/PolymorphicEntities.md
@@ -0,0 +1,871 @@
+# Polymorphic Entities
+
+A polymorphic entity is a data entity that can be of different types during the
+execution of a program.
+
+This document aims to give insights into the representation of polymorphic
+entities in FIR and into how polymorphism-related constructs and features are
+lowered to FIR.
+
+## Fortran standard
+
+Here is a list of the sections and constraints of the Fortran standard relevant
+to polymorphic entities.
+
+- 7.3.2.1 - 7.3.2.2: TYPE specifier (TYPE(*))
+  - C708
+  - C709
+  - C710
+  - C711
+- 7.3.2.3: CLASS specifier
+- 7.5.4.5: The passed-object dummy argument
+  - C760
+- 9.7.1: ALLOCATE statement
+  - C933
+- 9.7.2: NULLIFY statement
+  - When a NULLIFY statement is applied to a polymorphic pointer (7.3.2.3),
+    its dynamic type becomes the same as its declared type.
+- 10.2.2.3: Data pointer assignment
+- 11.1.3: ASSOCIATE construct
+- 11.1.11: SELECT TYPE construct
+  - C1157
+  - C1158
+  - C1159
+  - C1160
+  - C1161
+  - C1162
+  - C1163
+  - C1164
+  - C1165
+- 16.9.76 EXTENDS_TYPE_OF (A, MOLD)
+- 16.9.165 SAME_TYPE_AS (A, B)
+- 16.9.184 STORAGE_SIZE (A [, KIND])
+- C.10.5 Polymorphic Argument Association (15.5.2.9)
+
+---
+
+## Representation in FIR
+
+### Polymorphic entities `CLASS(type1)`
+
+A polymorphic entity is represented as a class type in FIR.
+In the example below the dummy argument `p` is passed to the subroutine `foo`
+as a polymorphic entity with the extensible type `point`. The type information
+captured in the class is the best statically available at compile time.
+`!fir.class` is a new type introduced for polymorphic entities. It is similar
+to a box type but allows the distinction between a monomorphic and a
+polymorphic descriptor.
+A specific `BoxTypeInterface` (TypeInterface) can be introduced to share the
+same API for both types where it is necessary. `!fir.class` and `!fir.box` can
+also be based on a common `BaseBoxType`, similar to what `BaseMemRefType` does
+for MemRef.
+
+**Fortran**
+```fortran
+type point
+  real :: x, y
+end type point
+
+type, extends(point) :: point_3d
+  real :: z
+end type
+
+subroutine foo(p)
+  class(point) :: p
+  ! code of the subroutine
+end subroutine
+```
+
+**FIR**
+```c
+func.func @foo(%p : !fir.class<!fir.type<_QTpoint{x:f32,y:f32}>>)
+```
+
+### Unlimited polymorphic entities `CLASS(*)`
+
+The unlimited polymorphic entity is represented as a class type with `*`.
+
+**Fortran**
+```fortran
+subroutine bar(x)
+  class(*) :: x
+  ! code of the subroutine
+end subroutine
+```
+
+**FIR**
+```c
+func.func @bar(%x : !fir.class<*>)
+```
+
+### Assumed-type `TYPE(*)`
+
+Assumed type was added in Fortran 2018 and is available only for dummy
+arguments. It is mainly used for interfaces to non-Fortran code and is similar
+to C's `void`. It is not a polymorphic entity per se, and it is not currently
+implemented in flang.
+
+Assumed-type is represented as `!fir.type<*>`.
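+Since assumed type is aimed at interoperability, a typical use is an interface
+to a C routine taking an arbitrary buffer. The following sketch is illustrative
+only (the routine name and its interface are assumptions, not taken from an
+existing test):
+
+```fortran
+interface
+  subroutine c_process(buffer, n) bind(c)
+    type(*), intent(in) :: buffer(*)  ! assumed-type, akin to C's void *
+    integer, intent(in), value :: n
+  end subroutine
+end interface
+```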
+
+### SELECT TYPE construct
+
+The `SELECT TYPE` construct selects for execution at most one of its
+constituent blocks. The selection is based on the dynamic type of the selector.
+
+**Fortran**
+```fortran
+type point
+  real :: x, y
+end type point
+type, extends(point) :: point_3d
+  real :: z
+end type point_3d
+type, extends(point) :: color_point
+  integer :: color
+end type color_point
+
+type(point), target :: p
+type(point_3d), target :: p3
+type(color_point), target :: c
+class(point), pointer :: p_or_c
+p_or_c => c
+select type ( a => p_or_c )
+class is (point)
+  print*, a%x, a%y
+type is (point_3d)
+  print*, a%x, a%y, a%z
+class default
+  print*,
+end select
+```
+
+From the Fortran standard:
+> A `TYPE IS` type guard statement matches the selector if the dynamic type
+and kind type parameter values of the selector are the same as those specified
+by the statement. A `CLASS IS` type guard statement matches the selector if the
+dynamic type of the selector is an extension of the type specified by the
+statement and the kind type parameter values specified by the statement are the
+same as the corresponding type parameter values of the dynamic type of the
+selector.
+
+In the example above the `CLASS IS` type guard is matched.
+
+The construct is lowered to a specific FIR operation `fir.select_type`. It is
+similar to other FIR "select" operations such as `fir.select` and
+`fir.select_rank`. The dynamic type of the selector value is matched against a
+list of type descriptors. The `TYPE IS` type guard statement is represented by
+a `#fir.type_is` attribute and the `CLASS IS` type guard statement is
+represented by a `#fir.class_is` attribute.
+The `CLASS DEFAULT` type guard statement is represented by a `unit` attribute.
+
+**FIR**
+```
+fir.select_type %p : !fir.class<!fir.type<_QTpoint{x:f32,y:f32}>> [
+  #fir.class_is<!fir.type<_QTpoint{x:f32,y:f32}>>, ^bb1,
+  #fir.type_is<!fir.type<_QTpoint_3d{x:f32,y:f32,z:f32}>>, ^bb2,
+  unit, ^bb3]
+```
+
+Lowering of the `fir.select_type` operation will produce an if-then-else
+ladder. The testing of the dynamic type of the selector is done by calling
+runtime functions.
+
+The runtime has two functions to compare dynamic types. Note that these two
+functions _ignore_ the values of `KIND` type parameters. A version of these
+functions that does not _ignore_ the value of the `KIND` type parameters will
+be implemented for the `SELECT TYPE` type guards testing.
+
+These are the functions currently available for the `EXTENDS_TYPE_OF` and
+`SAME_TYPE_AS` intrinsics (`flang/include/flang/Evaluate/type.h`):
+```cpp
+std::optional<bool> ExtendsTypeOf(const DynamicType &) const;
+std::optional<bool> SameTypeAs(const DynamicType &) const;
+```
+
+**FIR** (lower level FIR/MLIR after conversion to an if-then-else ladder)
+```
+module {
+  func @f(%arg0: !fir.class<*>) -> i32 {
+    %c4_i32 = arith.constant 4 : i32
+    %c8_i32 = arith.constant 8 : i32
+    %c16_i32 = arith.constant 16 : i32
+    %0 = fir.gentypedesc !fir.tdesc<!fir.type<_QTpoint{x:f32,y:f32}>>
+    %1 = fir.convert %arg0 : (!fir.class<*>) -> !fir.box<none>
+    %2 = fir.convert %0 : (!fir.tdesc<!fir.type<_QTpoint{x:f32,y:f32}>>) -> !fir.ref<none>
+    %3 = fir.call @ExtendsTypeOfWithKind(%1, %2) : (!fir.box<none>, !fir.ref<none>) -> i1
+    cond_br %3, ^bb2(%c4_i32 : i32), ^bb1
+  ^bb1:  // pred: ^bb0
+    %4 = fir.gentypedesc !fir.type<_QTpoint_3d{x:f32,y:f32,z:f32}>
+    %5 = fir.convert %arg0 : (!fir.class<*>) -> !fir.box<none>
+    %6 = fir.convert %4 : (!fir.tdesc<!fir.type<_QTpoint_3d{x:f32,y:f32,z:f32}>>) -> !fir.ref<none>
+    %7 = fir.call @SameTypeAsWithKind(%5, %6) : (!fir.box<none>, !fir.ref<none>) -> i1
+    cond_br %7, ^bb4(%c16_i32 : i32), ^bb3
+  ^bb2(%8: i32):  // pred: ^bb0
+    return %8 : i32
+  ^bb3:  // pred: ^bb1
+    br ^bb5(%c8_i32 : i32)
+  ^bb4(%9: i32):  // pred: ^bb1
+    %10 = arith.addi %9, %9 : i32
+    return %10 : i32
+  ^bb5(%11: i32):  // pred: ^bb3
+    %12 = arith.muli %11, %11 : i32
+    return %12 : i32
+  }
+  func private @ExactSameTypeAsWithKind(!fir.box<none>, !fir.ref<none>) -> i1
+  func private @SameTypeAsWithKind(!fir.box<none>, !fir.ref<none>) -> i1
+}
+```
+
+Note: some dynamic type checks can be inlined for performance. A type check
+against an intrinsic type when dealing with unlimited polymorphic entities is
+an ideal candidate for an inlined check.
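+As an illustration of such a candidate, here is a minimal sketch (written for
+this document, not taken from the flang sources) of a `SELECT TYPE` on an
+unlimited polymorphic entity where the guards name intrinsic types, so the
+dynamic type tests could be inlined instead of calling the runtime:
+
+```fortran
+subroutine classify(x)
+  class(*) :: x
+  select type (x)
+  type is (integer(4))
+    print *, "a 32-bit integer:", x
+  type is (real(4))
+    print *, "a 32-bit real:", x
+  class default
+    print *, "something else"
+  end select
+end subroutine
+```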
+
+---
+
+## Dynamic dispatch
+
+Dynamic dispatch is the process of selecting which implementation of a
+polymorphic procedure to call at runtime. The runtime already has the
+information to be used in this process (more information can be found here:
+[RuntimeTypeInfo.md](RuntimeTypeInfo.md)).
+
+The declarations of the data structures are present in
+`flang/runtime/type-info.h`.
+
+In the example below, there is a basic type `shape` with two type extensions
+`triangle` and `rectangle`.
+The two type extensions override the `get_area` type-bound procedure.
+
+**UML**
+```
+                  |---------------------|
+                  |        Shape        |
+                  |---------------------|
+                  | + color:integer     |
+                  | + isFilled:logical  |
+                  |---------------------|
+                  | + init()            |
+                  | + get_area():real   |
+                  |---------------------|
+                            /\
+                           /__\
+                             |
+       |---------------------------------------------------|
+       |                                                   |
+       |                                                   |
+|---------------------|                         |---------------------|
+|      triangle       |                         |      rectangle      |
+|---------------------|                         |---------------------|
+| + base:real         |                         | + length:real       |
+| + height:real       |                         | + width:real        |
+|---------------------|                         |---------------------|
+| + get_area():real   |                         | + get_area():real   |
+|---------------------|                         |---------------------|
+```
+
+**Fortran**
+```fortran
+module geometry
+type :: shape
+  integer :: color
+  logical :: isFilled
+contains
+  procedure :: get_area => get_area_shape
+  procedure :: init => init_shape
+end type shape
+
+type, extends(shape) :: triangle
+  real :: base
+  real :: height
+contains
+  procedure :: get_area => get_area_triangle
+end type triangle
+
+type, extends(shape) :: rectangle
+  real :: length
+  real :: width
+contains
+  procedure :: get_area => get_area_rectangle
+end type rectangle
+
+type shape_array
+  class(shape), allocatable :: item
+end type
+
+contains
+
+function get_area_shape(this)
+  real :: get_area_shape
+  class(shape) :: this
+  get_area_shape = 0.0
+end function
+
+subroutine init_shape(this, color)
+  class(shape) :: this
+  integer :: color
+  this%color = color
+  this%isFilled = .false.
+end subroutine
+
+function get_area_triangle(this)
+  real :: get_area_triangle
+  class(triangle) :: this
+  get_area_triangle = (this%base * this%height) / 2
+end function
+
+function get_area_rectangle(this)
+  real :: get_area_rectangle
+  class(rectangle) :: this
+  get_area_rectangle = this%length * this%width
+end function
+
+function get_all_area(shapes)
+  real :: get_all_area
+  type(shape_array) :: shapes(:)
+  real :: sum
+  integer :: i
+
+  get_all_area = 0.0
+
+  do i = 1, size(shapes)
+    get_all_area = get_all_area + shapes(i)%item%get_area()
+  end do
+end function
+
+subroutine set_base_values(sh, v1, v2)
+  class(shape) :: sh
+  real, intent(in) :: v1, v2
+
+  select type (sh)
+  type is (triangle)
+    sh%base = v1
+    sh%height = v2
+  type is (rectangle)
+    sh%length = v1
+    sh%width = v2
+  class default
+    print*,'Cannot set values'
+  end select
+end subroutine
+
+end module
+
+program foo
+  use geometry
+
+  real :: area
+
+  type(shape_array), dimension(2) :: shapes
+
+  allocate (triangle::shapes(1)%item)
+  allocate (rectangle::shapes(2)%item)
+
+  do i = 1, size(shapes)
+    call shapes(i)%item%init(i)
+  end do
+
+  call set_base_values(shapes(1)%item, 2.0, 1.5)
+  call set_base_values(shapes(2)%item, 5.0, 4.5)
+
+  area = get_all_area(shapes)
+
+  print*, area
+
+  deallocate(shapes(1)%item)
+  deallocate(shapes(2)%item)
+end program
+```
+
+The `fir.dispatch` operation is used to perform a dynamic dispatch. This
+operation is comparable to the `fir.call` operation but for polymorphic
+entities. Calls to `NON_OVERRIDABLE` type-bound procedures are resolved at
+compile time and a `fir.call` operation is emitted instead of a `fir.dispatch`.
+When the type of a polymorphic entity can be fully determined at compile time,
+a `fir.dispatch` op can even be converted to a `fir.call` op. This will be
+discussed in more detail later in the document, in the devirtualization
+section.
+
+**FIR**
+Here is a simple example of the `fir.dispatch` operation. The operation
+specifies the binding name of the type-bound procedure to be called and passes
+the descriptor as an argument. If the `NOPASS` attribute is set, then the
+descriptor is not passed as an argument when lowered. If `PASS(arg-name)` is
+specified, the `fir.pass` attribute is added to point to the PASS argument in
+the `fir.dispatch` operation. The `fir.nopass` attribute is added for `NOPASS`.
+The descriptor still needs to be present in the `fir.dispatch` operation for
+the dynamic dispatch. The CodeGen will then omit the descriptor in the
+arguments of the generated call.
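+To make the passed-object conventions concrete, here is a minimal sketch
+(written for this document; the module and procedure names are assumptions) of
+bindings declared with the default `PASS` convention and with `NOPASS`:
+
+```fortran
+module m
+  type :: counter
+    integer :: n = 0
+  contains
+    procedure :: bump             ! default: counter is the passed-object dummy
+    procedure, nopass :: describe ! NOPASS: no descriptor is passed
+  end type
+contains
+  subroutine bump(this)
+    class(counter) :: this
+    this%n = this%n + 1
+  end subroutine
+  subroutine describe()
+    print *, "a simple counter type"
+  end subroutine
+end module
+```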
+
+The dispatch explanation focuses only on the call to `get_area()` as seen in
+the example.
+
+**Fortran**
+```fortran
+get_all_area = get_all_area + shapes(i)%item%get_area()
+```
+
+**FIR**
+```c
+%1 = fir.convert %0 : (!fir.ref<!fir.class<!fir.type<_QMgeometryTtriangle{color:i32,isFilled:!fir.logical<4>,base:f32,height:f32}>>>) -> !fir.ref<!fir.class<none>>
+%2 = fir.dispatch "get_area"(%1) : (!fir.ref<!fir.class<none>>) -> f32
+```
+
+The type information is stored in the `f18Addendum` of the descriptor. The
+format is defined in `flang/runtime/type-info.h` and part of its representation
+in LLVM IR is shown below. The binding is comparable to a vtable. Each derived
+type has a complete type-bound procedure table in which all of the bindings of
+its ancestor types appear first.
+
+**LLVMIR**
+
+Representation of the derived type information with the bindings.
+```c
+%_QM__fortran_type_infoTderivedtype = type { { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, { ptr, i64, i32, i8, i8, i8, i8 }, i64, { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, i32, i8, i8, i8, i8, [4 x i8] }
+%_QM__fortran_type_infoTbinding = type { %_QM__fortran_builtinsT__builtin_c_funptr, { ptr, i64, i32, i8, i8, i8, i8 } }
+%_QM__fortran_builtinsT__builtin_c_funptr = type { i64 }
+```
+
+The `fir.dispatch` operation is then lowered to use the runtime information to
+extract the correct function from the vtable and to perform the actual call.
+Here is what it can look like in pseudo LLVM IR code.
+
+**LLVMIR**
+```c
+// Retrieve the bindings (vtable) from the type information in the descriptor
+%1 = call %_QM__fortran_type_infoTbinding* @_FortranAGetBindings(%desc)
+// Retrieve the position of the specific binding in the table
+%2 = call i32 @_FortranAGetBindingOffset(%1, "get_area")
+// Get the binding from the table
+%3 = getelementptr %_QM__fortran_type_infoTbinding, %_QM__fortran_type_infoTbinding* %1, i32 0, i32 %2
+// Get the function pointer from the binding
+%4 = getelementptr %_QM__fortran_builtinsT__builtin_c_funptr, %_QM__fortran_type_infoTbinding %3, i32 0, i32 0
+// Cast the function pointer
+%5 = inttoptr i64 %4 to f32 (%_QMgeometryTshape*)*
+// Load the function
+%6 = load f32(%_QMgeometryTshape*)*, %5
+// Perform the actual function call
+%7 = call f32 %6(%_QMgeometryTshape* %shape)
+```
+
+_Note:_ functions `@_FortranAGetBindings` and `@_FortranAGetBindingOffset` are
+not available in the runtime and will need to be implemented.
+
+- `@_FortranAGetBindings` retrieves the bindings from the descriptor. The
+  descriptor holds the type information that holds the bindings.
+- `@_FortranAGetBindingOffset` retrieves the procedure offset in the bindings
+  based on the binding name provided.
+
+Retrieving the binding table and the offset are done separately so that
+multiple dynamic dispatches on the same polymorphic entity can be optimized
+(the binding table is retrieved only once for multiple calls).
+
+### Passing polymorphic entities as argument
+
+**Fortran**
+```fortran
+TYPE t1
+END TYPE
+TYPE, EXTENDS(t1) :: t2
+END TYPE
+```
+
+1) Dummy argument is fixed type and actual argument is fixed type.
+   - `TYPE(t1)` to `TYPE(t1)`: Nothing special to take into consideration.
+2) Dummy argument is polymorphic and actual argument is fixed type. In these
+   cases, the actual argument needs to be boxed to be passed to the
+   subroutine/function since those are expecting a descriptor (a Fortran
+   source that could produce this FIR is sketched after this list).
+   ```c
+   func.func @_QMmod1Ps(%arg0: !fir.class<!fir.type<_QMmod1Tshape{x:i32,y:i32}>>)
+   func.func @_QQmain() {
+     %0 = fir.alloca !fir.type<_QMmod1Tshape{x:i32,y:i32}> {uniq_name = "_QFEsh"}
+     %1 = fir.embox %0 : (!fir.ref<!fir.type<_QMmod1Tshape{x:i32,y:i32}>>) -> !fir.class<!fir.type<_QMmod1Tshape{x:i32,y:i32}>>
+     fir.call @_QMmod1Ps(%1) : (!fir.class<!fir.type<_QMmod1Tshape{x:i32,y:i32}>>) -> ()
+     return
+   }
+   ```
+   - `TYPE(t1)` to `CLASS(t1)`
+   - `TYPE(t2)` to `CLASS(t1)`
+   - `TYPE(t1)` to `CLASS(t2)` - Invalid
+   - `TYPE(t2)` to `CLASS(t2)`
+3) Actual argument is polymorphic and dummy argument is fixed type. These
+   cases are restricted to the declared type of the polymorphic entities.
+   - The simple case is when the actual argument is a scalar
+     polymorphic entity passed to a non-PDT. The caller just extracts the
+     base address from the descriptor and passes it to the function.
+   - In other cases, the caller needs to perform a copyin/copyout since it
+     cannot just extract the base address of the `CLASS(T)` because it is
+     likely not contiguous.
+   - `CLASS(t1)` to `TYPE(t1)`
+   - `CLASS(t2)` to `TYPE(t1)` - Invalid
+   - `CLASS(t1)` to `TYPE(t2)` - Invalid
+   - `CLASS(t2)` to `TYPE(t2)`
+4) Both actual and dummy arguments are polymorphic. These particular cases are
+   straightforward. The function expects polymorphic entities already.
+   The boxed type is passed without change.
+   - `CLASS(t1)` to `CLASS(t1)`
+   - `CLASS(t2)` to `CLASS(t1)`
+   - `CLASS(t1)` to `CLASS(t2)` - Invalid
+   - `CLASS(t2)` to `CLASS(t2)`
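+A Fortran source that could plausibly produce the FIR shown in case 2 above,
+reconstructed here from the mangled names (`_QMmod1Ps`, `_QQmain`, `_QFEsh`);
+it is an illustration, not a quote of an existing test:
+
+```fortran
+module mod1
+  type shape
+    integer :: x, y
+  end type shape
+contains
+  subroutine s(sh)       ! lowered to @_QMmod1Ps
+    class(shape) :: sh   ! polymorphic dummy: expects a descriptor
+  end subroutine
+end module
+
+program main             ! lowered to @_QQmain
+  use mod1
+  type(shape) :: sh      ! fixed-type actual argument ("_QFEsh")
+  call s(sh)             ! boxed (fir.embox) at the call site
+end program
+```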
+
+### User-Defined Derived Type Input/Output
+
+User-Defined Derived Type Input/Output allows the program to define how a
+derived type is read from or written to a file.
+
+There are 4 basic subroutines that can be defined:
+- Formatted READ
+- Formatted WRITE
+- Unformatted READ
+- Unformatted WRITE
+
+Here are their respective interfaces:
+
+**Fortran**
+```fortran
+subroutine read_formatted(dtv, unit, iotype, v_list, iostat, iomsg)
+subroutine write_formatted(dtv, unit, iotype, v_list, iostat, iomsg)
+subroutine read_unformatted(dtv, unit, iostat, iomsg)
+subroutine write_unformatted(dtv, unit, iostat, iomsg)
+```
+
+When defined on a derived type, these specific type-bound procedures are stored
+as special bindings in the type descriptor (see `SpecialBinding` in
+`flang/runtime/type-info.h`).
+
+With a derived type, the function call to `@_FortranAioOutputDescriptor` from
+the IO runtime will be emitted in lowering.
+
+**Fortran**
+```fortran
+type(t) :: x
+write(10), x
+```
+
+**FIR**
+```c
+%5 = fir.call @_FortranAioBeginUnformattedOutput(%c10_i32, %4, %c56_i32) : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
+%6 = fir.embox %2 : (!fir.ref<!fir.type<_QFTt{i:i32}>>) -> !fir.class<!fir.type<_QFTt{i:i32}>>
+%7 = fir.convert %6 : (!fir.class<!fir.type<_QFTt{i:i32}>>) -> !fir.box<none>
+%8 = fir.call @_FortranAioOutputDescriptor(%5, %7) : (!fir.ref<i8>, !fir.box<none>) -> i1
+%9 = fir.call @_FortranAioEndIoStatement(%5) : (!fir.ref<i8>) -> i32
+```
+
+When dealing with polymorphic entities, the call to the IO runtime can stay
+unchanged. The runtime function `OutputDescriptor` can make the dynamic
+dispatch to the correct binding stored in the descriptor.
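+For reference, a minimal sketch of how such a procedure is attached to a type
+through a generic binding (illustrative only; the type and names are
+assumptions, not from an existing test):
+
+```fortran
+module m
+  type :: t
+    integer :: i
+  contains
+    procedure :: write_formatted
+    generic :: write(formatted) => write_formatted  ! stored as a special binding
+  end type
+contains
+  subroutine write_formatted(dtv, unit, iotype, v_list, iostat, iomsg)
+    class(t), intent(in) :: dtv
+    integer, intent(in) :: unit
+    character(*), intent(in) :: iotype
+    integer, intent(in) :: v_list(:)
+    integer, intent(out) :: iostat
+    character(*), intent(inout) :: iomsg
+    write (unit, '(i0)', iostat=iostat, iomsg=iomsg) dtv%i
+  end subroutine
+end module
+```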
+
+Note that the frontend is already replacing some of the dynamic dispatch calls
+with the correct static ones. The optimization pass is useful for the cases not
+handled by the frontend, and especially for cases showing up after other
+optimizations are applied.
+
+### `ALLOCATE`/`DEALLOCATE` statements
+
+The allocation and deallocation of polymorphic entities are delegated to the
+runtime.
+The corresponding function signatures can be found in
+`flang/include/flang/Runtime/allocatable.h` and in
+`flang/include/flang/Runtime/pointer.h` for pointer allocation.
+
+`ALLOCATE`
+
+The `ALLOCATE` statement is lowered to runtime calls as shown in the example
+below.
+
+**Fortran**
+```fortran
+allocate(triangle::shapes(1)%item)
+allocate(rectangle::shapes(2)%item)
+```
+
+**FIR**
+```c
+%0 = fir.alloca !fir.class<!fir.heap<!fir.type<_QMgeometryTtriangle{color:i32,isFilled:!fir.logical<4>,base:f32,height:f32}>>>
+%1 = fir.alloca !fir.class<!fir.heap<!fir.type<_QMgeometryTrectangle{color:i32,isFilled:!fir.logical<4>,length:f32,width:f32}>>>
+%3 = fir.convert %0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMgeometryTtriangle{color:i32,isFilled:!fir.logical<4>,base:f32,height:f32}>>>>) -> !fir.ref<!fir.box<none>>
+%4 = fir.gentypedesc !fir.type<_QMgeometryTtriangle{color:i32,isFilled:!fir.logical<4>,base:f32,height:f32}>
+%5 = fir.call @_FortranAAllocatableInitDerived(%3, %4)
+
+%6 = fir.convert %1 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMgeometryTrectangle{color:i32,isFilled:!fir.logical<4>,length:f32,width:f32}>>>>) -> !fir.ref<!fir.box<none>>
+%7 = fir.gentypedesc !fir.type<_QMgeometryTrectangle{color:i32,isFilled:!fir.logical<4>,length:f32,width:f32}>
+%8 = fir.call @_FortranAAllocatableInitDerived(%6, %7)
+```
+
+For pointer allocation, the `PointerAllocate` function is used.
+
+`DEALLOCATE`
+
+The `DEALLOCATE` statement is lowered to a runtime call to
+`AllocatableDeallocate`, or to `PointerDeallocate` for pointers.
+
+**Fortran**
+```fortran
+deallocate(shapes(1)%item)
+deallocate(shapes(2)%item)
+```
+
+**FIR**
+```c
+%8 = fir.call @_FortranAAllocatableDeallocate(%desc1)
+%9 = fir.call @_FortranAAllocatableDeallocate(%desc2)
+```
+
+### `EXTENDS_TYPE_OF`/`SAME_TYPE_AS` intrinsics
+
+The `EXTENDS_TYPE_OF` and `SAME_TYPE_AS` intrinsics have implementations in the
+runtime: respectively `ExtendsTypeOf` and `SameTypeAs` in
+`flang/include/flang/Evaluate/type.h`.
+
+Both intrinsic functions are lowered to their respective runtime calls.
+
+### Assignment / Pointer assignment
+
+Intrinsic assignment of an object to another is already implemented in the
+runtime. The function `@_FortranAAssign` performs the correct operations; its
+interface is available in `flang/include/flang/Runtime/assign.h`.
+
+### User defined assignment and operator
+
+**Fortran**
+```fortran
+module mod1
+type t1
+contains
+  procedure :: assign_t1
+  generic :: assignment(=) => assign_t1
+end type t1
+
+type, extends(t1) :: t2
+end type
+
+contains
+
+subroutine assign_t1(to, from)
+  class(t1), intent(inout) :: to
+  class(t1), intent(in) :: from
+  ! Custom code for the assignment
+end subroutine
+
+subroutine assign_t2(to, from)
+  class(t2), intent(inout) :: to
+  class(t2), intent(in) :: from
+  ! Custom code for the assignment
+end subroutine
+
+end module
+
+program main
+use mod1
+
+class(t1), allocatable :: v1
+class(t1), allocatable :: v2
+
+allocate(t2::v1)
+allocate(t2::v2)
+
+v2 = v1
+
+end program
+```
+
+In the example above, the assignment `v2 = v1` is done with a call to
+`assign_t1`. This is resolved at compile time, since `t2` cannot have a generic
+type-bound procedure for assignment whose interface is not distinguishable from
+the inherited one. The same applies to user-defined operators.
+
+### `NULLIFY`
+
+When a `NULLIFY` statement is applied to a polymorphic pointer (7.3.2.3), its
+dynamic type becomes the same as its declared type.
+
+The `NULLIFY` statement is lowered to a call to the corresponding runtime
+function `PointerNullifyDerived` in `flang/include/flang/Runtime/pointer.h`.
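+
+A minimal Fortran illustration of this dynamic type reset, reusing the geometry
+types from the earlier examples:
+
+**Fortran**
+```fortran
+class(shape), pointer :: sh
+type(triangle), target :: tr
+
+sh => tr     ! the dynamic type of sh is now triangle
+nullify(sh)  ! the dynamic type of sh reverts to its declared type, shape
+```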
+
+### Impact on existing FIR operations dealing with descriptors
+
+Currently, FIR has a couple of operations taking descriptors as inputs or
+producing descriptors as outputs. These operations might need to deal with the
+dynamic type of polymorphic entities.
+
+- `fir.load`/`fir.store`
+  - Currently, a `fir.load` of a `fir.box` is a special case: no copy is made
+    in code generation. This could be problematic with polymorphic entities.
+    When a `fir.load` is performed on a `fir.class` type, the dynamic type can
+    be copied.
+
+  **Fortran**
+  ```fortran
+  module mod1
+    class(shape), pointer :: a
+  contains
+    subroutine sub1()
+      associate (b => a)
+        ! Some more code
+      end associate
+    end subroutine
+  end module
+  ```
+
+  In the example above, the dynamic type of `a` and `b` might be different. The
+  dynamic type of `a` must be copied when it is associated with `b`.
+
+  **FIR**
+  ```c
+  // fir.load must copy the dynamic type from the pointer `a`
+  %0 = fir.address_of(@_QMmod1Ea) : !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMgeometryTshape{color:i32,isFilled:!fir.logical<4>}>>>>
+  %1 = fir.load %0 : !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMgeometryTshape{color:i32,isFilled:!fir.logical<4>}>>>>
+  ```
+
+- `fir.embox`
+  - The embox operation is used to create a descriptor from a reference. With
+    polymorphic entities, it is used to create a polymorphic descriptor from
+    a derived type. The declared type of the descriptor and the derived type
+    are identical. The dynamic type of the descriptor must be set when it is
+    created. This is already handled by lowering.
+
+- `fir.rebox`
+  - The rebox operation is used to create a new descriptor from another
+    descriptor, with optional new dimension information. If the original
+    descriptor describes a polymorphic entity, its dynamic type must be
+    propagated to the new descriptor.
+    ```
+    %0 = fir.slice %c10, %c33, %c2 : (index, index, index) -> !fir.slice<1>
+    %1 = fir.shift %c0 : (index) -> !fir.shift<1>
+    %2 = fir.rebox %x(%1)[%0] : (!fir.class<!fir.array<?x!fir.type<_QMgeometryTshape{color:i32,isFilled:!fir.logical<4>}>>>, !fir.shift<1>, !fir.slice<1>) -> !fir.class<!fir.array<?x!fir.type<_QMgeometryTshape{color:i32,isFilled:!fir.logical<4>}>>>
+    ```
+---
+
+# Testing
+
+- The lowering part is tested with in-tree LIT tests.
+- Polymorphic entities involve a lot of runtime information, so executable
+  tests will be useful for full testing.
+
+---
+
+# Current TODOs
+Current list of TODOs in lowering:
+- `flang/lib/Lower/Allocatable.cpp:465` not yet implemented: SOURCE allocation
+- `flang/lib/Lower/Allocatable.cpp:468` not yet implemented: MOLD allocation
+- `flang/lib/Lower/Allocatable.cpp:471` not yet implemented: polymorphic entity allocation
+- `flang/lib/Lower/Bridge.cpp:448` not yet implemented: create polymorphic host associated copy
+- `flang/lib/Lower/Bridge.cpp:2185` not yet implemented: assignment to polymorphic allocatable
+- `flang/lib/Lower/Bridge.cpp:2288` not yet implemented: pointer assignment involving polymorphic entity
+- `flang/lib/Lower/Bridge.cpp:2316` not yet implemented: pointer assignment involving polymorphic entity
+- `flang/lib/Lower/CallInterface.cpp:795` not yet implemented: support for polymorphic types
+- `flang/lib/Lower/ConvertType.cpp:237` not yet implemented: support for polymorphic types
+
+Current list of TODOs in code generation:
+
+- `flang/lib/Optimizer/CodeGen/CodeGen.cpp:897` not yet implemented: fir.dispatch codegen
+- `flang/lib/Optimizer/CodeGen/CodeGen.cpp:911` not yet implemented: fir.dispatch_table codegen
+- `flang/lib/Optimizer/CodeGen/CodeGen.cpp:924` not yet implemented: fir.dt_entry codegen
+- `flang/lib/Optimizer/CodeGen/CodeGen.cpp:2651` not yet implemented: fir.gentypedesc codegen
+
+---
+
+Resources:
+- [1] https://www.pgroup.com/blogs/posts/f03-oop-part1.htm
+- [2] https://www.pgroup.com/blogs/posts/f03-oop-part2.htm
+- [3] https://www.pgroup.com/blogs/posts/f03-oop-part3.htm
+- [4] https://www.pgroup.com/blogs/posts/f03-oop-part4.htm
+- [5] Modern Fortran explained
diff --git a/flang/include/flang/Common/idioms.h b/flang/include/flang/Common/idioms.h
--- a/flang/include/flang/Common/idioms.h
+++ b/flang/include/flang/Common/idioms.h
@@ -123,6 +123,9 @@
   const std::size_t value;
 };
 
+template <typename T>
+ListItemCount(std::initializer_list<T>) -> ListItemCount;
+
 #define ENUM_CLASS(NAME, ...)
\ enum class NAME { __VA_ARGS__ }; \ LLVM_ATTRIBUTE_UNUSED static constexpr std::size_t NAME##_enumSize{[] { \ diff --git a/flang/lib/Decimal/CMakeLists.txt b/flang/lib/Decimal/CMakeLists.txt --- a/flang/lib/Decimal/CMakeLists.txt +++ b/flang/lib/Decimal/CMakeLists.txt @@ -1,5 +1,5 @@ -add_flang_library(FortranDecimal +add_flang_library(FortranDecimal INSTALL_WITH_TOOLCHAIN binary-to-decimal.cpp decimal-to-binary.cpp ) diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -31,11 +31,14 @@ #include "flang/Optimizer/Support/FIRContext.h" #include "flang/Optimizer/Transforms/Passes.h" #include "mlir/IR/Matchers.h" +#include "mlir/IR/TypeUtilities.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/RegionUtils.h" +#include "llvm/ADT/Optional.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "flang-simplify-intrinsics" @@ -159,8 +162,13 @@ /// with signature provided by \p funcOp. The caller is responsible /// for saving/restoring the original insertion point of \p builder. /// \p funcOp is expected to be empty on entry to this function. +/// \p arg1ElementTy and \p arg2ElementTy specify elements types +/// of the underlying array objects - they are used to generate proper +/// element accesses. static void genFortranADotBody(fir::FirOpBuilder &builder, - mlir::func::FuncOp &funcOp) { + mlir::func::FuncOp &funcOp, + mlir::Type arg1ElementTy, + mlir::Type arg2ElementTy) { // function FortranADotProduct_simplified(arr1, arr2) // T, dimension(:) :: arr1, arr2 // T product = 0 @@ -171,14 +179,15 @@ // FortranADotProduct_simplified = product // end function FortranADotProduct_simplified auto loc = mlir::UnknownLoc::get(builder.getContext()); - mlir::Type elementType = funcOp.getResultTypes()[0]; + mlir::Type resultElementType = funcOp.getResultTypes()[0]; builder.setInsertionPointToEnd(funcOp.addEntryBlock()); mlir::IndexType idxTy = builder.getIndexType(); - mlir::Value zero = elementType.isa() - ? builder.createRealConstant(loc, elementType, 0.0) - : builder.createIntegerConstant(loc, elementType, 0); + mlir::Value zero = + resultElementType.isa() + ? builder.createRealConstant(loc, resultElementType, 0.0) + : builder.createIntegerConstant(loc, resultElementType, 0); mlir::Block::BlockArgListType args = funcOp.front().getArguments(); mlir::Value arg1 = args[0]; @@ -187,10 +196,12 @@ mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0); fir::SequenceType::Shape flatShape = {fir::SequenceType::getUnknownExtent()}; - mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType); - mlir::Type boxArrTy = fir::BoxType::get(arrTy); - mlir::Value array1 = builder.create(loc, boxArrTy, arg1); - mlir::Value array2 = builder.create(loc, boxArrTy, arg2); + mlir::Type arrTy1 = fir::SequenceType::get(flatShape, arg1ElementTy); + mlir::Type boxArrTy1 = fir::BoxType::get(arrTy1); + mlir::Value array1 = builder.create(loc, boxArrTy1, arg1); + mlir::Type arrTy2 = fir::SequenceType::get(flatShape, arg2ElementTy); + mlir::Type boxArrTy2 = fir::BoxType::get(arrTy2); + mlir::Value array2 = builder.create(loc, boxArrTy2, arg2); // This version takes the loop trip count from the first argument. 
// If the first argument's box has unknown (at compilation time) // extent, then it may be better to take the extent from the second @@ -216,19 +227,25 @@ mlir::OpBuilder::InsertPoint loopEndPt = builder.saveInsertionPoint(); builder.setInsertionPointToStart(loop.getBody()); - mlir::Type eleRefTy = builder.getRefType(elementType); + mlir::Type eleRef1Ty = builder.getRefType(arg1ElementTy); mlir::Value index = loop.getInductionVar(); mlir::Value addr1 = - builder.create(loc, eleRefTy, array1, index); + builder.create(loc, eleRef1Ty, array1, index); mlir::Value elem1 = builder.create(loc, addr1); + // Convert to the result type. + elem1 = builder.create(loc, resultElementType, elem1); + + mlir::Type eleRef2Ty = builder.getRefType(arg2ElementTy); mlir::Value addr2 = - builder.create(loc, eleRefTy, array2, index); + builder.create(loc, eleRef2Ty, array2, index); mlir::Value elem2 = builder.create(loc, addr2); + // Convert to the result type. + elem2 = builder.create(loc, resultElementType, elem2); - if (elementType.isa()) + if (resultElementType.isa()) sumVal = builder.create( loc, builder.create(loc, elem1, elem2), sumVal); - else if (elementType.isa()) + else if (resultElementType.isa()) sumVal = builder.create( loc, builder.create(loc, elem1, elem2), sumVal); else @@ -317,6 +334,29 @@ return 0; } +/// Given the call operation's box argument \p val, discover +/// the element type of the underlying array object. +/// \returns the element type or llvm::None if the type cannot +/// be reliably found. +/// We expect that the argument is a result of fir.convert +/// with the destination type of !fir.box. +static llvm::Optional getArgElementType(mlir::Value val) { + mlir::Operation *defOp; + do { + defOp = val.getDefiningOp(); + // Analyze only sequences of convert operations. + if (!mlir::isa(defOp)) + return llvm::None; + val = defOp->getOperand(0); + // The convert operation is expected to convert from one + // box type to another box type. + auto boxType = val.getType().cast(); + auto elementType = fir::unwrapSeqOrBoxedSeqType(boxType); + if (!elementType.isa()) + return elementType; + } while (true); +} + void SimplifyIntrinsicsPass::runOnOperation() { LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n"); mlir::ModuleOp module = getOperation(); @@ -380,11 +420,42 @@ if (!type.isa() && !type.isa()) return; + // Try to find the element types of the boxed arguments. + auto arg1Type = getArgElementType(v1); + auto arg2Type = getArgElementType(v2); + + if (!arg1Type || !arg2Type) + return; + + // Support only floating point and integer arguments + // now (e.g. logical is skipped here). + if (!arg1Type->isa() && + !arg1Type->isa()) + return; + if (!arg2Type->isa() && + !arg2Type->isa()) + return; + auto typeGenerator = [&type](fir::FirOpBuilder &builder) { return genFortranADotType(builder, type); }; + auto bodyGenerator = [&arg1Type, + &arg2Type](fir::FirOpBuilder &builder, + mlir::func::FuncOp &funcOp) { + genFortranADotBody(builder, funcOp, *arg1Type, *arg2Type); + }; + + // Suffix the function name with the element types + // of the arguments. 
+ std::string typedFuncName(funcName); + llvm::raw_string_ostream nameOS(typedFuncName); + nameOS << "_"; + arg1Type->print(nameOS); + nameOS << "_"; + arg2Type->print(nameOS); + mlir::func::FuncOp newFunc = getOrCreateFunction( - builder, funcName, typeGenerator, genFortranADotBody); + builder, typedFuncName, typeGenerator, bodyGenerator); auto newCall = builder.create(loc, newFunc, mlir::ValueRange{v1, v2}); call->replaceAllUsesWith(newCall.getResults()); diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt --- a/flang/runtime/CMakeLists.txt +++ b/flang/runtime/CMakeLists.txt @@ -88,4 +88,6 @@ LINK_LIBS FortranDecimal + + INSTALL_WITH_TOOLCHAIN ) diff --git a/flang/runtime/FortranMain/CMakeLists.txt b/flang/runtime/FortranMain/CMakeLists.txt --- a/flang/runtime/FortranMain/CMakeLists.txt +++ b/flang/runtime/FortranMain/CMakeLists.txt @@ -1,3 +1,3 @@ -add_flang_library(Fortran_main STATIC +add_flang_library(Fortran_main STATIC INSTALL_WITH_TOOLCHAIN Fortran_main.c ) diff --git a/flang/test/Lower/OpenACC/acc-data-operands.f90 b/flang/test/Lower/OpenACC/acc-data-operands.f90 --- a/flang/test/Lower/OpenACC/acc-data-operands.f90 +++ b/flang/test/Lower/OpenACC/acc-data-operands.f90 @@ -113,12 +113,88 @@ end subroutine -subroutine acc_operand_array_section2(a) - real, dimension(100) :: a +! Testing array sections on allocatable array +subroutine acc_operand_array_section_allocatable() + real, allocatable :: a(:) + + allocate(a(100)) + + !$acc data copyin(a(1:50)) copyout(a(51:100)) + !$acc end data + + !CHECK: %[[ARR_HEAP:.*]] = fir.alloca !fir.heap> {uniq_name = "_QMacc_data_operandFacc_operand_array_section_allocatableEa.addr"} + + !CHECK: %[[LOAD_ARR0:.*]] = fir.load %[[ARR_HEAP]] : !fir.ref>> + !CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32 + !CHECK: %[[C1_I64:.*]] = fir.convert %[[C1_I32]] : (i32) -> i64 + !CHECK: %[[LB0:.*]] = fir.convert %[[C1_I64]] : (i64) -> index + !CHECK: %[[C1_STEP:.*]] = arith.constant 1 : i64 + !CHECK: %[[STEP0:.*]] = fir.convert %[[C1_STEP]] : (i64) -> index + !CHECK: %[[C50_I32:.*]] = arith.constant 50 : i32 + !CHECK: %[[C50_I64:.*]] = fir.convert %[[C50_I32]] : (i32) -> i64 + !CHECK: %[[UB0:.*]] = fir.convert %[[C50_I64]] : (i64) -> index + !CHECK: %[[SHAPE_SHIFT0:.*]] = fir.shape_shift %{{.*}}, %{{.*}} : (index, index) -> !fir.shapeshift<1> + !CHECK: %[[SLICE0:.*]] = fir.slice %[[LB0]], %[[UB0]], %[[STEP0]] : (index, index, index) -> !fir.slice<1> + !CHECK: %[[ARR_SECTION0:.*]] = fir.embox %[[LOAD_ARR0]](%[[SHAPE_SHIFT0]]) [%[[SLICE0]]] : (!fir.heap>, !fir.shapeshift<1>, !fir.slice<1>) -> !fir.box> + !CHECK: %[[MEM0:.*]] = fir.alloca !fir.box> + !CHECK: fir.store %[[ARR_SECTION0]] to %[[MEM0]] : !fir.ref>> + + !CHECK: %[[LOAD_ARR1:.*]] = fir.load %[[ARR_HEAP]] : !fir.ref>> + !CHECK: %[[C51_I32:.*]] = arith.constant 51 : i32 + !CHECK: %[[C51_I64:.*]] = fir.convert %[[C51_I32]] : (i32) -> i64 + !CHECK: %[[LB1:.*]] = fir.convert %[[C51_I64]] : (i64) -> index + !CHECK: %[[C1_STEP:.*]] = arith.constant 1 : i64 + !CHECK: %[[STEP1:.*]] = fir.convert %[[C1_STEP]] : (i64) -> index + !CHECK: %[[C100_I32:.*]] = arith.constant 100 : i32 + !CHECK: %[[C100_I64:.*]] = fir.convert %[[C100_I32]] : (i32) -> i64 + !CHECK: %[[UB1:.*]] = fir.convert %[[C100_I64]] : (i64) -> index + !CHECK: %[[SHAPE_SHIFT1:.*]] = fir.shape_shift %{{.*}}, %{{.*}} : (index, index) -> !fir.shapeshift<1> + !CHECK: %[[SLICE1:.*]] = fir.slice %[[LB1]], %[[UB1]], %[[STEP1]] : (index, index, index) -> !fir.slice<1> + !CHECK: %[[ARR_SECTION1:.*]] = fir.embox 
%[[LOAD_ARR1]](%[[SHAPE_SHIFT1]]) [%[[SLICE1]]] : (!fir.heap>, !fir.shapeshift<1>, !fir.slice<1>) -> !fir.box> + !CHECK: %[[MEM1:.*]] = fir.alloca !fir.box> + !CHECK: fir.store %[[ARR_SECTION1]] to %[[MEM1]] : !fir.ref>> + + !CHECK: acc.data copyin(%[[MEM0]] : !fir.ref>>) copyout(%[[MEM1]] : !fir.ref>>) + + deallocate(a) +end subroutine - !$acc data copyin(a) + +! Testing array sections on pointer array +subroutine acc_operand_array_section_pointer() + real, target :: a(100) + real, pointer :: p(:) + + p => a + + !$acc data copyin(p(1:50)) !$acc end data + !CHECK: %[[C100:.*]] = arith.constant 100 : index + !CHECK: %[[ARR:.*]] = fir.alloca !fir.array<100xf32> {bindc_name = "a", fir.target, uniq_name = "_QMacc_data_operandFacc_operand_array_section_pointerEa"} + !CHECK: %[[PTR:.*]] = fir.alloca !fir.box>> {bindc_name = "p", uniq_name = "_QMacc_data_operandFacc_operand_array_section_pointerEp"} + !CHECK: %[[SHAPE0:.*]] = fir.shape %[[C100]] : (index) -> !fir.shape<1> + !CHECK: %[[EMBOX0:.*]] = fir.embox %[[ARR]](%[[SHAPE0]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box>> + !CHECK: fir.store %[[EMBOX0]] to %[[PTR]] : !fir.ref>>> + !CHECK: %[[PTR_LOAD:.*]] = fir.load %[[PTR]] : !fir.ref>>> + !CHECK: %[[C0:.*]] = arith.constant 0 : index + !CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[PTR_LOAD]], %[[C0]] : (!fir.box>>, index) -> (index, index, index) + !CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32 + !CHECK: %[[C1_I64:.*]] = fir.convert %[[C1_I32]] : (i32) -> i64 + !CHECK: %[[LB0:.*]] = fir.convert %[[C1_I64]] : (i64) -> index + !CHECK: %[[C1_STEP:.*]] = arith.constant 1 : i64 + !CHECK: %[[STEP0:.*]] = fir.convert %[[C1_STEP]] : (i64) -> index + !CHECK: %[[C50_I32:.*]] = arith.constant 50 : i32 + !CHECK: %[[C50_I64:.*]] = fir.convert %[[C50_I32]] : (i32) -> i64 + !CHECK: %[[UB0:.*]] = fir.convert %[[C50_I64]] : (i64) -> index + !CHECK: %[[SHIFT0:.*]] = fir.shift %[[BOX_DIMS]]#0 : (index) -> !fir.shift<1> + !CHECK: %[[SLICE0:.*]] = fir.slice %[[LB0]], %[[UB0]], %[[STEP0]] : (index, index, index) -> !fir.slice<1> + !CHECK: %[[REBOX0:.*]] = fir.rebox %7(%[[SHIFT0]]) [%[[SLICE0]]] : (!fir.box>>, !fir.shift<1>, !fir.slice<1>) -> !fir.box> + !CHECK: %[[MEM0:.*]] = fir.alloca !fir.box> + !CHECK: fir.store %[[REBOX0]] to %[[MEM0]] : !fir.ref>> + + !CHECK: acc.data copyin(%[[MEM0]] : !fir.ref>>) { + end subroutine end module diff --git a/flang/test/Transforms/simplifyintrinsics.fir b/flang/test/Transforms/simplifyintrinsics.fir --- a/flang/test/Transforms/simplifyintrinsics.fir +++ b/flang/test/Transforms/simplifyintrinsics.fir @@ -344,15 +344,15 @@ // CHECK: %[[RESLOC:.*]] = fir.alloca f32 {bindc_name = "dot", uniq_name = "_QFdotEdot"} // CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box>) -> !fir.box // CHECK: %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box>) -> !fir.box -// CHECK: %[[RES:.*]] = fir.call @_FortranADotProductReal4_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box, !fir.box) -> f32 +// CHECK: %[[RES:.*]] = fir.call @_FortranADotProductReal4_f32_f32_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box, !fir.box) -> f32 // CHECK: fir.store %[[RES]] to %[[RESLOC]] : !fir.ref // CHECK: %[[RET:.*]] = fir.load %[[RESLOC]] : !fir.ref // CHECK: return %[[RET]] : f32 // CHECK: } -// CHECK-LABEL: func.func private @_FortranADotProductReal4_simplified( -// CHECK-SAME: %[[A:.*]]: !fir.box, -// CHECK-SAME: %[[B:.*]]: !fir.box) -> f32 attributes {llvm.linkage = #llvm.linkage} { +// CHECK-LABEL: func.func private @_FortranADotProductReal4_f32_f32_simplified( +// CHECK-SAME: %[[A:.*]]: !fir.box, +// CHECK-SAME: 
%[[B:.*]]: !fir.box) -> f32 attributes {llvm.linkage = #llvm.linkage} { // CHECK: %[[FZERO:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[IZERO:.*]] = arith.constant 0 : index // CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box) -> !fir.box> @@ -363,9 +363,11 @@ // CHECK: %[[RES:.*]] = fir.do_loop %[[IDX:.*]] = %[[IZERO]] to %[[LEN]] step %[[IONE]] iter_args(%[[SUM:.*]] = %[[FZERO]]) -> (f32) { // CHECK: %[[ALOC:.*]] = fir.coordinate_of %[[ACAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[AVAL:.*]] = fir.load %[[ALOC]] : !fir.ref +// CHECK: %[[AVALCAST:.*]] = fir.convert %[[AVAL]] : (f32) -> f32 // CHECK: %[[BLOC:.*]] = fir.coordinate_of %[[BCAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[BVAL:.*]] = fir.load %[[BLOC]] : !fir.ref -// CHECK: %[[MUL:.*]] = arith.mulf %[[AVAL]], %[[BVAL]] : f32 +// CHECK: %[[BVALCAST:.*]] = fir.convert %[[BVAL]] : (f32) -> f32 +// CHECK: %[[MUL:.*]] = arith.mulf %[[AVALCAST]], %[[BVALCAST]] : f32 // CHECK: %[[NEWSUM:.*]] = arith.addf %[[MUL]], %[[SUM]] : f32 // CHECK: fir.result %[[NEWSUM]] : f32 // CHECK: } @@ -479,15 +481,15 @@ // CHECK: %[[RESLOC:.*]] = fir.alloca i32 {bindc_name = "dot", uniq_name = "_QFdotEdot"} // CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box>) -> !fir.box // CHECK: %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box>) -> !fir.box -// CHECK: %[[RES:.*]] = fir.call @_FortranADotProductInteger4_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box, !fir.box) -> i32 +// CHECK: %[[RES:.*]] = fir.call @_FortranADotProductInteger4_i32_i32_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box, !fir.box) -> i32 // CHECK: fir.store %[[RES]] to %[[RESLOC]] : !fir.ref // CHECK: %[[RET:.*]] = fir.load %[[RESLOC]] : !fir.ref // CHECK: return %[[RET]] : i32 // CHECK: } -// CHECK-LABEL: func.func private @_FortranADotProductInteger4_simplified( -// CHECK-SAME: %[[A:.*]]: !fir.box, -// CHECK-SAME: %[[B:.*]]: !fir.box) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// CHECK-LABEL: func.func private @_FortranADotProductInteger4_i32_i32_simplified( +// CHECK-SAME: %[[A:.*]]: !fir.box, +// CHECK-SAME: %[[B:.*]]: !fir.box) -> i32 attributes {llvm.linkage = #llvm.linkage} { // CHECK: %[[I32ZERO:.*]] = arith.constant 0 : i32 // CHECK: %[[IZERO:.*]] = arith.constant 0 : index // CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box) -> !fir.box> @@ -498,9 +500,11 @@ // CHECK: %[[RES:.*]] = fir.do_loop %[[IDX:.*]] = %[[IZERO]] to %[[LEN]] step %[[IONE]] iter_args(%[[SUM:.*]] = %[[I32ZERO]]) -> (i32) { // CHECK: %[[ALOC:.*]] = fir.coordinate_of %[[ACAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[AVAL:.*]] = fir.load %[[ALOC]] : !fir.ref +// CHECK: %[[AVALCAST:.*]] = fir.convert %[[AVAL]] : (i32) -> i32 // CHECK: %[[BLOC:.*]] = fir.coordinate_of %[[BCAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref // CHECK: %[[BVAL:.*]] = fir.load %[[BLOC]] : !fir.ref -// CHECK: %[[MUL:.*]] = arith.muli %[[AVAL]], %[[BVAL]] : i32 +// CHECK: %[[BVALCAST:.*]] = fir.convert %[[BVAL]] : (i32) -> i32 +// CHECK: %[[MUL:.*]] = arith.muli %[[AVALCAST]], %[[BVALCAST]] : i32 // CHECK: %[[NEWSUM:.*]] = arith.addi %[[MUL]], %[[SUM]] : i32 // CHECK: fir.result %[[NEWSUM]] : i32 // CHECK: } @@ -587,3 +591,63 @@ // CHECK-SAME: %[[A:.*]]: !fir.box> {fir.bindc_name = "a"}, // CHECK-SAME: %[[B:.*]]: !fir.box> {fir.bindc_name = "b"}) -> i64 { // CHECK-NOT: call{{.*}}_FortranADotProductInteger8( + +// ----- + +// Test mixed types, e.g. when _FortranADotProductReal8 is called +// with and arguments. 
The loaded elements must be converted +// to the result type REAL(8) before the computations. + +func.func @dot_f64_f32(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: !fir.box> {fir.bindc_name = "b"}) -> f64 { + %0 = fir.alloca f64 {bindc_name = "dot", uniq_name = "_QFdotEdot"} + %1 = fir.address_of(@_QQcl.2E2F646F742E66393000) : !fir.ref> + %c3_i32 = arith.constant 3 : i32 + %2 = fir.convert %arg0 : (!fir.box>) -> !fir.box + %3 = fir.convert %arg1 : (!fir.box>) -> !fir.box + %4 = fir.convert %1 : (!fir.ref>) -> !fir.ref + %5 = fir.call @_FortranADotProductReal8(%2, %3, %4, %c3_i32) : (!fir.box, !fir.box, !fir.ref, i32) -> f64 + fir.store %5 to %0 : !fir.ref + %6 = fir.load %0 : !fir.ref + return %6 : f64 +} +func.func private @_FortranADotProductReal4(!fir.box, !fir.box, !fir.ref, i32) -> f32 attributes {fir.runtime} +fir.global linkonce @_QQcl.2E2F646F742E66393000 constant : !fir.char<1,10> { + %0 = fir.string_lit "./dot.f90\00"(10) : !fir.char<1,10> + fir.has_value %0 : !fir.char<1,10> +} + +// CHECK-LABEL: func.func @dot_f64_f32( +// CHECK-SAME: %[[A:.*]]: !fir.box> {fir.bindc_name = "a"}, +// CHECK-SAME: %[[B:.*]]: !fir.box> {fir.bindc_name = "b"}) -> f64 { +// CHECK: %[[RESLOC:.*]] = fir.alloca f64 {bindc_name = "dot", uniq_name = "_QFdotEdot"} +// CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box>) -> !fir.box +// CHECK: %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box>) -> !fir.box +// CHECK: %[[RES:.*]] = fir.call @_FortranADotProductReal8_f64_f32_simplified(%[[ACAST]], %[[BCAST]]) : (!fir.box, !fir.box) -> f64 +// CHECK: fir.store %[[RES]] to %[[RESLOC]] : !fir.ref +// CHECK: %[[RET:.*]] = fir.load %[[RESLOC]] : !fir.ref +// CHECK: return %[[RET]] : f64 +// CHECK: } + +// CHECK-LABEL: func.func private @_FortranADotProductReal8_f64_f32_simplified( +// CHECK-SAME: %[[A:.*]]: !fir.box, +// CHECK-SAME: %[[B:.*]]: !fir.box) -> f64 attributes {llvm.linkage = #llvm.linkage} { +// CHECK: %[[FZERO:.*]] = arith.constant 0.000000e+00 : f64 +// CHECK: %[[IZERO:.*]] = arith.constant 0 : index +// CHECK: %[[ACAST:.*]] = fir.convert %[[A]] : (!fir.box) -> !fir.box> +// CHECK: %[[BCAST:.*]] = fir.convert %[[B]] : (!fir.box) -> !fir.box> +// CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[ACAST]], %[[IZERO]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[IONE:.*]] = arith.constant 1 : index +// CHECK: %[[LEN:.*]] = arith.subi %[[DIMS]]#1, %[[IONE]] : index +// CHECK: %[[RES:.*]] = fir.do_loop %[[IDX:.*]] = %[[IZERO]] to %[[LEN]] step %[[IONE]] iter_args(%[[SUM:.*]] = %[[FZERO]]) -> (f64) { +// CHECK: %[[ALOC:.*]] = fir.coordinate_of %[[ACAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref +// CHECK: %[[AVAL:.*]] = fir.load %[[ALOC]] : !fir.ref +// CHECK: %[[AVALCAST:.*]] = fir.convert %[[AVAL]] : (f64) -> f64 +// CHECK: %[[BLOC:.*]] = fir.coordinate_of %[[BCAST]], %[[IDX]] : (!fir.box>, index) -> !fir.ref +// CHECK: %[[BVAL:.*]] = fir.load %[[BLOC]] : !fir.ref +// CHECK: %[[BVALCAST:.*]] = fir.convert %[[BVAL]] : (f32) -> f64 +// CHECK: %[[MUL:.*]] = arith.mulf %[[AVALCAST]], %[[BVALCAST]] : f64 +// CHECK: %[[NEWSUM:.*]] = arith.addf %[[MUL]], %[[SUM]] : f64 +// CHECK: fir.result %[[NEWSUM]] : f64 +// CHECK: } +// CHECK: return %[[RES]] : f64 +// CHECK: } diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -91,6 +91,10 @@ libc.src.stdlib.realloc libc.src.stdlib.free + # stdio.h entrypoints + libc.src.stdio.sprintf + libc.src.stdio.snprintf + # 
sys/stat.h entrypoints libc.src.sys.stat.mkdir libc.src.sys.stat.mkdirat @@ -242,8 +246,6 @@ libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.fwrite_unlocked - libc.src.stdio.sprintf - libc.src.stdio.snprintf libc.src.stdio.fprintf libc.src.stdio.printf libc.src.stdio.stderr diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -92,6 +92,10 @@ libc.src.stdlib.aligned_alloc libc.src.stdlib.free + # stdio.h entrypoints + libc.src.stdio.sprintf + libc.src.stdio.snprintf + # sys/mman.h entrypoints libc.src.sys.mman.mmap libc.src.sys.mman.munmap @@ -298,8 +302,6 @@ libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.fwrite_unlocked - libc.src.stdio.sprintf - libc.src.stdio.snprintf libc.src.stdio.fprintf libc.src.stdio.printf libc.src.stdio.stderr diff --git a/libc/src/CMakeLists.txt b/libc/src/CMakeLists.txt --- a/libc/src/CMakeLists.txt +++ b/libc/src/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory(math) add_subdirectory(string) add_subdirectory(stdlib) +add_subdirectory(stdio) if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(dirent) @@ -24,6 +25,5 @@ # since assert uses the signal API, we disable assert also. # add_subdirectory(assert) # add_subdirectory(signal) -add_subdirectory(stdio) add_subdirectory(threads) add_subdirectory(time) diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -31,17 +31,6 @@ .core_structs ) -add_object_library( - file_writer - SRCS - file_writer.cpp - HDRS - file_writer.h - DEPENDS - libc.src.__support.File.file - .core_structs -) - add_object_library( writer SRCS @@ -91,6 +80,23 @@ libc.src.__support.arg_list ) +if(NOT (TARGET libc.src.__support.File.file)) + # Not all platforms have a file implementation. If file is unvailable, + # then we must skip all file based printf sections. + return() +endif() + +add_object_library( + file_writer + SRCS + file_writer.cpp + HDRS + file_writer.h + DEPENDS + libc.src.__support.File.file + .core_structs +) + add_object_library( vfprintf_internal SRCS diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -34,6 +34,7 @@ add_subdirectory(math) add_subdirectory(string) add_subdirectory(stdlib) +add_subdirectory(stdio) if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(fcntl) @@ -50,7 +51,6 @@ # since assert uses the signal API, we disable assert also. 
# add_subdirectory(assert) # add_subdirectory(signal) -add_subdirectory(stdio) add_subdirectory(time) if(${LIBC_TARGET_OS} STREQUAL "linux") diff --git a/libc/test/src/stdio/printf_core/parser_test.cpp b/libc/test/src/stdio/printf_core/parser_test.cpp --- a/libc/test/src/stdio/printf_core/parser_test.cpp +++ b/libc/test/src/stdio/printf_core/parser_test.cpp @@ -191,7 +191,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithLongLengthModifier) { __llvm_libc::printf_core::FormatSection format_arr[10]; const char *str = "%lld"; - int arg1 = 12345; + long long arg1 = 12345; evaluate(format_arr, str, arg1); __llvm_libc::printf_core::FormatSection expected; @@ -208,7 +208,7 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithAllOptions) { __llvm_libc::printf_core::FormatSection format_arr[10]; const char *str = "% -056.78jd"; - int arg1 = 12345; + intmax_t arg1 = 12345; evaluate(format_arr, str, arg1); __llvm_libc::printf_core::FormatSection expected; diff --git a/libc/utils/UnitTest/CMakeLists.txt b/libc/utils/UnitTest/CMakeLists.txt --- a/libc/utils/UnitTest/CMakeLists.txt +++ b/libc/utils/UnitTest/CMakeLists.txt @@ -54,13 +54,6 @@ libc.src.__support.CPP.array_ref ) -if(NOT LLVM_LIBC_FULL_BUILD) # TODO(michaelrj): make a more permanant solution. - return() -endif() - -#currently stdio is fullbuild only, so this matcher that depends on a piece of -#printf also has to be fullbuild only. - add_library( LibcPrintfHelpers PrintfMatcher.h diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -592,6 +592,7 @@ -Wno-user-defined-literals -Wno-covered-switch-default -Wno-suggest-override + -Wno-ctad-maybe-unsupported ) if (LIBCXX_TARGETING_CLANG_CL) target_add_compile_flags_if_supported(${target} PRIVATE diff --git a/libcxx/docs/Status/SpaceshipProjects.csv b/libcxx/docs/Status/SpaceshipProjects.csv --- a/libcxx/docs/Status/SpaceshipProjects.csv +++ b/libcxx/docs/Status/SpaceshipProjects.csv @@ -15,9 +15,9 @@ | `[type.info] `_,| `typeinfo `_,None,Adrian Vogelsgesang,|Complete| | `[coroutine.handle.compare] `_,| `coroutine_handle `_,[comparisons.three.way],Chuanqi Xu,|Complete| | `[pairs.spec] `_,| `pair `_,[expos.only.func],Kent Ross,|Complete| -| `[syserr.errcat.nonvirtuals] `_,| `error_category `_,[comparisons.three.way],Adrian Vogelsgesang,|In Progress| +| `[syserr.errcat.nonvirtuals] `_,| `error_category `_,[comparisons.three.way],Adrian Vogelsgesang,|Complete| | `[syserr.compare] `_,"| `error_code `_ -| `error_condition `_",None,Adrian Vogelsgesang,|In Progress| +| `error_condition `_",None,Adrian Vogelsgesang,|Complete| | `[tuple.rel] `_,| `tuple `_,[expos.only.func],Kent Ross,|Complete| "| `[optional.relops] `_ | `[optional.nullops] `_ @@ -29,31 +29,31 @@ | `[unique.ptr.special] `_,| `unique_ptr `_,[comparisons.three.way],Adrian Vogelsgesang,|Complete| | `[util.smartptr.shared.cmp] `_,| `shared_ptr `_,[comparisons.three.way],Adrian Vogelsgesang,|Complete| | `[type.index.members] `_,| `type_index `_,None,Adrian Vogelsgesang,|Complete| -| `[charconv.syn] `_,| to_chars_result,None,Mark de Wever,|Complete| -| `[charconv.syn] `_,| from_chars_result,None,Mark de Wever,|Complete| +| `[charconv.syn] `_,| `to_chars_result `_,None,Mark de Wever,|Complete| +| `[charconv.syn] `_,| `from_chars_result `_,None,Mark de Wever,|Complete| | `[stacktrace.entry.cmp] `_,| stacktrace_entry,None,Unassigned,|Not Started| | `[stacktrace.basic.cmp] `_,| basic_stacktrace,[alg.three.way],Unassigned,|Not Started| -| `[string.cmp] `_,| `basic_string `,None,Mark de 
Wever,|Complete| +| `[string.cmp] `_,| `basic_string `_,None,Mark de Wever,|Complete| | `[string.view.comparison] `_,| `basic_string_view `_,None,Mark de Wever,|Complete| -| `[array.syn] `_ (`general `_),| array,[expos.only.func],Unassigned,|Not Started| -| `[deque.syn] `_ (`general `_),| deque,[expos.only.func],Unassigned,|Not Started| -| `[forward.list.syn] `_ (`general `_),| forward_list,[expos.only.func],Unassigned,|Not Started| -| `[list.syn] `_ (`general `_),| list,[expos.only.func],Unassigned,|Not Started| -| `[vector.syn] `_ (`general `_),| vector,[expos.only.func],Unassigned,|Not Started| -| `[associative.map.syn] `_ (`general `_),"| map +| `[array.syn] `_ (`general `_),| array,[expos.only.func],Unassigned,|Not Started| +| `[deque.syn] `_ (`general `_),| deque,[expos.only.func],Unassigned,|Not Started| +| `[forward.list.syn] `_ (`general `_),| forward_list,[expos.only.func],Unassigned,|Not Started| +| `[list.syn] `_ (`general `_),| list,[expos.only.func],Unassigned,|Not Started| +| `[vector.syn] `_ (`general `_),| vector,[expos.only.func],Unassigned,|Not Started| +| `[associative.map.syn] `_ (`general `_),"| map | multimap",[expos.only.func],Unassigned,|Not Started| -| `[associative.set.syn] `_ (`general `_),"| multiset +| `[associative.set.syn] `_ (`general `_),"| multiset | set",[expos.only.func],Unassigned,|Not Started| | `[queue.ops] `_,| queue,None,Unassigned,|Not Started| | `[stack.ops] `_,| stack,None,Unassigned,|Not Started| -| `[reverse.iter.cmp] `_,| reverse_iterator,None,Mikhail Maltsev,|Complete| +| `[reverse.iter.cmp] `_,| `reverse_iterator `_,None,Mikhail Maltsev,|Complete| | `[move.iter.op.comp] `_,| move_iterator,None,Unassigned,|Not Started| | `[counted.iter.cmp] `_,| counted_iterator,None,Unassigned,|Not Started| | `[range.iota.iterator] `_,| `ranges::iota_view::iterator `_,[concepts.cmp],Arthur O'Dwyer,|Complete| | `[range.transform.iterator] `_,| `ranges::transform_view::iterator `_,[concepts.cmp],Arthur O'Dwyer,|Complete| | `[range.elements.iterator] `_,| ranges::elements_view::iterator,[concepts.cmp],Unassigned,|Not Started| | `[time.duration.comparisons] `_, "chrono::duration", None, Mark de Wever, |Not Started| -| `[time.point.comparisons] `_, "chrono::point", None, Mark de Wever, |Not Started| +| `[time.point.comparisons] `_, "chrono::time_point", None, Mark de Wever, |Not Started| "| `[time.cal.day.nonmembers] `_ | `[time.cal.month.nonmembers] `_ | `[time.cal.year.nonmembers] `_ @@ -61,14 +61,14 @@ | `[time.cal.mdlast] `_ | `[time.cal.ym.nonmembers] `_ | `[time.cal.ymd.nonmembers] `_ -| `[time.cal.ymdlast.nonmembers] `_","| chrono::day -| chrono::month -| chrono::year -| chrono::month_day -| chrono::month_day_last -| chrono::year_month -| chrono::year_month_day -| chrono::year_month_day_last",None,Mark de Wever,|Complete| +| `[time.cal.ymdlast.nonmembers] `_","| `chrono::day `_ +| `chrono::month `_ +| `chrono::year `_ +| `chrono::month_day `_ +| `chrono::month_day_last `_ +| `chrono::year_month `_ +| `chrono::year_month_day `_ +| `chrono::year_month_day_last `_",None,Mark de Wever,|Complete| "| `[time.zone.nonmembers] `_ | `[time.zone.leap.nonmembers] `_ | `[time.zone.link.nonmembers] `_","| chrono::time_zone diff --git a/libcxx/include/system_error b/libcxx/include/system_error --- a/libcxx/include/system_error +++ b/libcxx/include/system_error @@ -32,8 +32,9 @@ virtual string message(int ev) const = 0; bool operator==(const error_category& rhs) const noexcept; - bool operator!=(const error_category& rhs) const noexcept; - bool operator<(const 
error_category& rhs) const noexcept; + bool operator!=(const error_category& rhs) const noexcept; // removed in C++20 + bool operator<(const error_category& rhs) const noexcept; // removed in C++20 + strong_ordering operator<=>(const error_category& rhs) const noexcept; // C++20 }; const error_category& generic_category() noexcept; @@ -75,7 +76,6 @@ }; // non-member functions: -bool operator<(const error_code& lhs, const error_code& rhs) noexcept; template basic_ostream& operator<<(basic_ostream& os, const error_code& ec); @@ -102,8 +102,6 @@ explicit operator bool() const noexcept; }; -bool operator<(const error_condition& lhs, const error_condition& rhs) noexcept; - class system_error : public runtime_error { @@ -128,12 +126,16 @@ // Comparison operators: bool operator==(const error_code& lhs, const error_code& rhs) noexcept; bool operator==(const error_code& lhs, const error_condition& rhs) noexcept; -bool operator==(const error_condition& lhs, const error_code& rhs) noexcept; +bool operator==(const error_condition& lhs, const error_code& rhs) noexcept; // removed in C++20 bool operator==(const error_condition& lhs, const error_condition& rhs) noexcept; -bool operator!=(const error_code& lhs, const error_code& rhs) noexcept; -bool operator!=(const error_code& lhs, const error_condition& rhs) noexcept; -bool operator!=(const error_condition& lhs, const error_code& rhs) noexcept; -bool operator!=(const error_condition& lhs, const error_condition& rhs) noexcept; +bool operator!=(const error_code& lhs, const error_code& rhs) noexcept; // removed in C++20 +bool operator!=(const error_code& lhs, const error_condition& rhs) noexcept; // removed in C++20 +bool operator!=(const error_condition& lhs, const error_code& rhs) noexcept; // removed in C++20 +bool operator!=(const error_condition& lhs, const error_condition& rhs) noexcept; // removed in C++20 +bool operator<(const error_condition& lhs, const error_condition& rhs) noexcept; // removed in C++20 +bool operator<(const error_code& lhs, const error_code& rhs) noexcept; // removed in C++20 +strong_ordering operator<=>(const error_code& lhs, const error_code& rhs) noexcept; // C++20 +strong_ordering operator<=>(const error_condition& lhs, const error_condition& rhs) noexcept; // C++20 template <> struct hash; template <> struct hash; @@ -147,6 +149,7 @@ #include <__errc> #include <__functional/hash.h> #include <__functional/unary_function.h> +#include <__memory/addressof.h> #include #include #include @@ -223,12 +226,21 @@ _LIBCPP_INLINE_VISIBILITY bool operator==(const error_category& __rhs) const _NOEXCEPT {return this == &__rhs;} +#if _LIBCPP_STD_VER > 17 + + _LIBCPP_HIDE_FROM_ABI + strong_ordering operator<=>(const error_category& __rhs) const noexcept {return compare_three_way()(this, std::addressof(__rhs));} + +#else // _LIBCPP_STD_VER > 17 + _LIBCPP_INLINE_VISIBILITY bool operator!=(const error_category& __rhs) const _NOEXCEPT {return !(*this == __rhs);} _LIBCPP_INLINE_VISIBILITY bool operator< (const error_category& __rhs) const _NOEXCEPT {return this < &__rhs;} +#endif // _LIBCPP_STD_VER > 17 + friend class _LIBCPP_HIDDEN __do_message; }; @@ -303,14 +315,6 @@ return error_condition(static_cast(__e), generic_category()); } -inline _LIBCPP_INLINE_VISIBILITY -bool -operator<(const error_condition& __x, const error_condition& __y) _NOEXCEPT -{ - return __x.category() < __y.category() - || (__x.category() == __y.category() && __x.value() < __y.value()); -} - // error_code class _LIBCPP_TYPE_VIS error_code @@ -379,14 +383,6 @@ return 
error_code(static_cast(__e), generic_category()); } -inline _LIBCPP_INLINE_VISIBILITY -bool -operator<(const error_code& __x, const error_code& __y) _NOEXCEPT -{ - return __x.category() < __y.category() - || (__x.category() == __y.category() && __x.value() < __y.value()); -} - inline _LIBCPP_INLINE_VISIBILITY bool operator==(const error_code& __x, const error_code& __y) _NOEXCEPT @@ -402,12 +398,14 @@ || __y.category().equivalent(__x, __y.value()); } +#if _LIBCPP_STD_VER <= 17 inline _LIBCPP_INLINE_VISIBILITY bool operator==(const error_condition& __x, const error_code& __y) _NOEXCEPT { return __y == __x; } +#endif inline _LIBCPP_INLINE_VISIBILITY bool @@ -416,6 +414,8 @@ return __x.category() == __y.category() && __x.value() == __y.value(); } +#if _LIBCPP_STD_VER <= 17 + inline _LIBCPP_INLINE_VISIBILITY bool operator!=(const error_code& __x, const error_code& __y) _NOEXCEPT @@ -436,6 +436,42 @@ operator!=(const error_condition& __x, const error_condition& __y) _NOEXCEPT {return !(__x == __y);} +inline _LIBCPP_INLINE_VISIBILITY +bool +operator<(const error_condition& __x, const error_condition& __y) _NOEXCEPT +{ + return __x.category() < __y.category() + || (__x.category() == __y.category() && __x.value() < __y.value()); +} + +inline _LIBCPP_INLINE_VISIBILITY +bool +operator<(const error_code& __x, const error_code& __y) _NOEXCEPT +{ + return __x.category() < __y.category() + || (__x.category() == __y.category() && __x.value() < __y.value()); +} + +#else // _LIBCPP_STD_VER <= 17 + +inline _LIBCPP_HIDE_FROM_ABI strong_ordering +operator<=>(const error_code& __x, const error_code& __y) noexcept +{ + if (auto __c = __x.category() <=> __y.category(); __c != 0) + return __c; + return __x.value() <=> __y.value(); +} + +inline _LIBCPP_HIDE_FROM_ABI strong_ordering +operator<=>(const error_condition& __x, const error_condition& __y) noexcept +{ + if (auto __c = __x.category() <=> __y.category(); __c != 0) + return __c; + return __x.value() <=> __y.value(); +} + +#endif // _LIBCPP_STD_VER <= 17 + template <> struct _LIBCPP_TEMPLATE_VIS hash : public __unary_function diff --git a/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_code.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_code.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_code.pass.cpp @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// class error_code + +// strong_ordering operator<=>(const error_code& lhs, const error_code& rhs) noexcept + +#include +#include + +#include "test_macros.h" +#include "test_comparisons.h" + +int main(int, char**) { + AssertOrderAreNoexcept(); + AssertOrderReturn(); + + // Same error category + std::error_code ec1a = std::error_code(1, std::generic_category()); + std::error_code ec1b = std::error_code(1, std::generic_category()); + std::error_code ec2 = std::error_code(2, std::generic_category()); + + assert(testOrder(ec1a, ec1b, std::strong_ordering::equal)); + assert(testOrder(ec1a, ec2, std::strong_ordering::less)); + + // Different error category + const std::error_code& ec3 = std::error_code(2, std::system_category()); + + bool isLess = ec2 < ec3; + assert(testOrder(ec2, ec3, isLess ? std::strong_ordering::less : std::strong_ordering::greater)); + + return 0; +} diff --git a/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_condition.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_condition.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/diagnostics/syserr/syserr.compare/cmp_error_condition.pass.cpp @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// class error_condition + +// strong_ordering operator<=>(const error_condition& lhs, const error_condition& rhs) noexcept + +#include +#include + +#include "test_macros.h" +#include "test_comparisons.h" + +int main(int, char**) { + AssertOrderAreNoexcept(); + AssertOrderReturn(); + + // Same error category + std::error_condition ec1a = std::error_condition(1, std::generic_category()); + std::error_condition ec1b = std::error_condition(1, std::generic_category()); + std::error_condition ec2 = std::error_condition(2, std::generic_category()); + + assert(testOrder(ec1a, ec1b, std::strong_ordering::equal)); + assert(testOrder(ec1a, ec2, std::strong_ordering::less)); + + // Different error category + const std::error_condition& ec3 = std::error_condition(2, std::system_category()); + + bool isLess = ec2 < ec3; + assert(testOrder(ec2, ec3, isLess ? std::strong_ordering::less : std::strong_ordering::greater)); + + return 0; +} diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.nonvirtuals/cmp.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.nonvirtuals/cmp.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.nonvirtuals/cmp.pass.cpp @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// class error_category + +// strong_ordering operator<=>(const error_category& rhs) const noexcept; + +#include +#include + +#include "test_macros.h" +#include "test_comparisons.h" + +int main(int, char**) { + AssertOrderAreNoexcept(); + AssertOrderReturn(); + + const std::error_category& e_cat1 = std::generic_category(); + const std::error_category& e_cat2 = std::generic_category(); + const std::error_category& e_cat3 = std::system_category(); + + assert(testOrder(e_cat1, e_cat2, std::strong_ordering::equal)); + + bool isLess = e_cat1 < e_cat3; + assert(testOrder(e_cat1, e_cat3, isLess ? std::strong_ordering::less : std::strong_ordering::greater)); + + return 0; +} diff --git a/libcxx/test/support/MoveOnly.h b/libcxx/test/support/MoveOnly.h --- a/libcxx/test/support/MoveOnly.h +++ b/libcxx/test/support/MoveOnly.h @@ -62,7 +62,7 @@ { typedef MoveOnly argument_type; typedef size_t result_type; - TEST_CONSTEXPR size_t operator()(const MoveOnly& x) const {return x.get();} + TEST_CONSTEXPR size_t operator()(const MoveOnly& x) const {return static_cast(x.get());} }; #endif // MOVEONLY_H diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -190,8 +190,8 @@ #define TEST_HAS_NO_EXCEPTIONS #endif -#if TEST_HAS_FEATURE(address_sanitizer) || TEST_HAS_FEATURE(memory_sanitizer) || \ - TEST_HAS_FEATURE(thread_sanitizer) +#if TEST_HAS_FEATURE(address_sanitizer) || TEST_HAS_FEATURE(hwaddress_sanitizer) || \ + TEST_HAS_FEATURE(memory_sanitizer) || TEST_HAS_FEATURE(thread_sanitizer) #define TEST_HAS_SANITIZERS #endif diff --git a/lld/test/ELF/edata-etext.s b/lld/test/ELF/edata-etext.s --- a/lld/test/ELF/edata-etext.s +++ b/lld/test/ELF/edata-etext.s @@ -37,7 +37,7 @@ ## If a relocatable object file defines non-reserved identifiers (by C and C++) ## edata/end/etext, don't redefine them. Note: GNU ld redefines the reserved -## _edata while we don't for simplicty. +## _edata while we don't for simplicity. # RUN: ld.lld %t/b.o -o %t/b # RUN: llvm-objdump -t %t/b | FileCheck %s --check-prefix=CHECK2 # RUN: ld.lld %t/c.o -o %t/c diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -844,8 +844,7 @@ {std::make_pair(&info.Languages, &languages), std::make_pair(&info.Tools, &tools), std::make_pair(&info.SDKs, &sDKs)}) for (auto &producer : *producers.first) - if (producers.second->end() == - llvm::find_if(*producers.second, + if (llvm::none_of(*producers.second, [&](std::pair seen) { return seen.first == producer.first; })) diff --git a/lldb/examples/customization/bin-utils/binutils.py b/lldb/examples/customization/bin-utils/binutils.py --- a/lldb/examples/customization/bin-utils/binutils.py +++ b/lldb/examples/customization/bin-utils/binutils.py @@ -1,7 +1,5 @@ "Collection of tools for displaying bit representation of numbers.""" -from __future__ import print_function - def binary(n, width=None): """ Return a list of (0|1)'s for the binary representation of n where n >= 0. 
diff --git a/lldb/examples/customization/import-python/importcmd.py b/lldb/examples/customization/import-python/importcmd.py --- a/lldb/examples/customization/import-python/importcmd.py +++ b/lldb/examples/customization/import-python/importcmd.py @@ -1,4 +1,3 @@ -from __future__ import print_function import sys import os import lldb diff --git a/lldb/examples/customization/pwd-cd-and-system/utils.py b/lldb/examples/customization/pwd-cd-and-system/utils.py --- a/lldb/examples/customization/pwd-cd-and-system/utils.py +++ b/lldb/examples/customization/pwd-cd-and-system/utils.py @@ -1,5 +1,4 @@ """Utility for changing directories and execution of commands in a subshell.""" -from __future__ import print_function import os import shlex diff --git a/lldb/examples/darwin/heap_find/heap.py b/lldb/examples/darwin/heap_find/heap.py --- a/lldb/examples/darwin/heap_find/heap.py +++ b/lldb/examples/darwin/heap_find/heap.py @@ -8,7 +8,6 @@ # (lldb) script import lldb.macosx.heap #---------------------------------------------------------------------- -from __future__ import print_function import lldb import optparse import os diff --git a/lldb/examples/python/bsd.py b/lldb/examples/python/bsd.py --- a/lldb/examples/python/bsd.py +++ b/lldb/examples/python/bsd.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import cmd import optparse diff --git a/lldb/examples/python/cmdtemplate.py b/lldb/examples/python/cmdtemplate.py --- a/lldb/examples/python/cmdtemplate.py +++ b/lldb/examples/python/cmdtemplate.py @@ -9,8 +9,6 @@ # (lldb) command script import /path/to/cmdtemplate.py # --------------------------------------------------------------------- -from __future__ import print_function - import inspect import lldb import optparse diff --git a/lldb/examples/python/delta.py b/lldb/examples/python/delta.py --- a/lldb/examples/python/delta.py +++ b/lldb/examples/python/delta.py @@ -16,8 +16,6 @@ # available. #---------------------------------------------------------------------- -from __future__ import print_function - import optparse import os import shlex diff --git a/lldb/examples/python/diagnose_nsstring.py b/lldb/examples/python/diagnose_nsstring.py --- a/lldb/examples/python/diagnose_nsstring.py +++ b/lldb/examples/python/diagnose_nsstring.py @@ -4,8 +4,6 @@ # decisions it did and providing some useful context information that can # be used for improving the formatter -from __future__ import print_function - import lldb diff --git a/lldb/examples/python/diagnose_unwind.py b/lldb/examples/python/diagnose_unwind.py --- a/lldb/examples/python/diagnose_unwind.py +++ b/lldb/examples/python/diagnose_unwind.py @@ -5,7 +5,6 @@ # information about the stack frames, and tries an alternate unwind # algorithm, that will help to understand why lldb's unwind algorithm # did not succeed. -from __future__ import print_function import optparse import lldb diff --git a/lldb/examples/python/gdbremote.py b/lldb/examples/python/gdbremote.py --- a/lldb/examples/python/gdbremote.py +++ b/lldb/examples/python/gdbremote.py @@ -16,7 +16,6 @@ # available. 
#---------------------------------------------------------------------- -from __future__ import print_function import binascii import subprocess import json diff --git a/lldb/examples/python/globals.py b/lldb/examples/python/globals.py --- a/lldb/examples/python/globals.py +++ b/lldb/examples/python/globals.py @@ -7,7 +7,6 @@ # For the shells sh, bash: # PYTHONPATH=/Applications/Xcode.app/Contents/SharedFrameworks/LLDB.framework/Resources/Python ./globals.py [ ...] #---------------------------------------------------------------------- -from __future__ import print_function import lldb import optparse diff --git a/lldb/examples/python/jump.py b/lldb/examples/python/jump.py --- a/lldb/examples/python/jump.py +++ b/lldb/examples/python/jump.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import lldb import re diff --git a/lldb/examples/python/lldb_module_utils.py b/lldb/examples/python/lldb_module_utils.py --- a/lldb/examples/python/lldb_module_utils.py +++ b/lldb/examples/python/lldb_module_utils.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import lldb import optparse diff --git a/lldb/examples/python/lldbtk.py b/lldb/examples/python/lldbtk.py --- a/lldb/examples/python/lldbtk.py +++ b/lldb/examples/python/lldbtk.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import lldb import shlex diff --git a/lldb/examples/python/mach_o.py b/lldb/examples/python/mach_o.py --- a/lldb/examples/python/mach_o.py +++ b/lldb/examples/python/mach_o.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import cmd import dict_utils diff --git a/lldb/examples/python/memory.py b/lldb/examples/python/memory.py --- a/lldb/examples/python/memory.py +++ b/lldb/examples/python/memory.py @@ -9,8 +9,6 @@ # (lldb) command script import /path/to/cmdtemplate.py #---------------------------------------------------------------------- -from __future__ import print_function - import platform import os import re diff --git a/lldb/examples/python/performance.py b/lldb/examples/python/performance.py --- a/lldb/examples/python/performance.py +++ b/lldb/examples/python/performance.py @@ -8,8 +8,6 @@ # export PYTHONPATH=/Applications/Xcode.app/Contents/SharedFrameworks/LLDB.framework/Resources/Python #---------------------------------------------------------------------- -from __future__ import print_function - import optparse import os import platform diff --git a/lldb/examples/python/process_events.py b/lldb/examples/python/process_events.py --- a/lldb/examples/python/process_events.py +++ b/lldb/examples/python/process_events.py @@ -8,8 +8,6 @@ # export PYTHONPATH=/Applications/Xcode.app/Contents/SharedFrameworks/LLDB.framework/Resources/Python #---------------------------------------------------------------------- -from __future__ import print_function - import optparse import os import platform diff --git a/lldb/examples/python/pytracer.py b/lldb/examples/python/pytracer.py --- a/lldb/examples/python/pytracer.py +++ b/lldb/examples/python/pytracer.py @@ -1,4 +1,3 @@ -from __future__ import print_function import sys import inspect from collections import OrderedDict diff --git a/lldb/examples/python/scripted_step.py b/lldb/examples/python/scripted_step.py --- a/lldb/examples/python/scripted_step.py +++ b/lldb/examples/python/scripted_step.py @@ -93,8 +93,6 @@ # # (lldb) thread step-scripted -C scripted_step.StepWithPlan -from __future__ import print_function - import lldb diff --git a/lldb/examples/python/shadow.py 
b/lldb/examples/python/shadow.py --- a/lldb/examples/python/shadow.py +++ b/lldb/examples/python/shadow.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import lldb import shlex diff --git a/lldb/examples/python/sources.py b/lldb/examples/python/sources.py --- a/lldb/examples/python/sources.py +++ b/lldb/examples/python/sources.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import lldb import shlex diff --git a/lldb/examples/python/stacks.py b/lldb/examples/python/stacks.py --- a/lldb/examples/python/stacks.py +++ b/lldb/examples/python/stacks.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -from __future__ import print_function import lldb import optparse import shlex diff --git a/lldb/examples/python/symbolication.py b/lldb/examples/python/symbolication.py --- a/lldb/examples/python/symbolication.py +++ b/lldb/examples/python/symbolication.py @@ -26,7 +26,6 @@ # PYTHONPATH=/path/to/LLDB.framework/Resources/Python ./crashlog.py ~/Library/Logs/DiagnosticReports/a.crash #---------------------------------------------------------------------- -from __future__ import print_function import lldb import optparse import os diff --git a/lldb/examples/python/types.py b/lldb/examples/python/types.py --- a/lldb/examples/python/types.py +++ b/lldb/examples/python/types.py @@ -9,8 +9,6 @@ # (lldb) command script import /path/to/cmdtemplate.py #---------------------------------------------------------------------- -from __future__ import print_function - import platform import os import re diff --git a/lldb/examples/scripting/tree_utils.py b/lldb/examples/scripting/tree_utils.py --- a/lldb/examples/scripting/tree_utils.py +++ b/lldb/examples/scripting/tree_utils.py @@ -18,8 +18,6 @@ http://lldb.llvm.org/scripting.html """ -from __future__ import print_function - def DFS(root, word, cur_path): """ diff --git a/lldb/examples/summaries/cocoa/CFBitVector.py b/lldb/examples/summaries/cocoa/CFBitVector.py --- a/lldb/examples/summaries/cocoa/CFBitVector.py +++ b/lldb/examples/summaries/cocoa/CFBitVector.py @@ -5,7 +5,6 @@ See https://llvm.org/LICENSE.txt for license information. 
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """ -from __future__ import print_function # summary provider for CF(Mutable)BitVector import lldb diff --git a/lldb/examples/summaries/cocoa/Logger.py b/lldb/examples/summaries/cocoa/Logger.py --- a/lldb/examples/summaries/cocoa/Logger.py +++ b/lldb/examples/summaries/cocoa/Logger.py @@ -1,4 +1,3 @@ -from __future__ import print_function import sys import os.path import inspect diff --git a/lldb/examples/summaries/cocoa/NSNumber.py b/lldb/examples/summaries/cocoa/NSNumber.py --- a/lldb/examples/summaries/cocoa/NSNumber.py +++ b/lldb/examples/summaries/cocoa/NSNumber.py @@ -8,8 +8,6 @@ # example summary provider for NSNumber # the real summary is now C++ code built into LLDB -from __future__ import print_function - import lldb import ctypes import lldb.runtime.objc.objc_runtime diff --git a/lldb/examples/synthetic/gnu_libstdcpp.py b/lldb/examples/synthetic/gnu_libstdcpp.py --- a/lldb/examples/synthetic/gnu_libstdcpp.py +++ b/lldb/examples/synthetic/gnu_libstdcpp.py @@ -1,4 +1,3 @@ -from __future__ import division import lldb.formatters.Logger # C++ STL formatters for LLDB diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -82,6 +82,7 @@ eBroadcastBitProgress = (1 << 0), eBroadcastBitWarning = (1 << 1), eBroadcastBitError = (1 << 2), + eBroadcastSymbolChange = (1 << 3), }; static ConstString GetStaticBroadcasterClass(); @@ -430,6 +431,8 @@ llvm::Optional<lldb::user_id_t> debugger_id = llvm::None, std::once_flag *once = nullptr); + static void ReportSymbolChange(const ModuleSpec &module_spec); + protected: friend class CommandInterpreter; friend class REPL; diff --git a/lldb/include/lldb/Core/DebuggerEvents.h b/lldb/include/lldb/Core/DebuggerEvents.h --- a/lldb/include/lldb/Core/DebuggerEvents.h +++ b/lldb/include/lldb/Core/DebuggerEvents.h @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "lldb/Core/ModuleSpec.h" #include "lldb/Utility/ConstString.h" #include "lldb/Utility/Event.h" @@ -82,6 +83,28 @@ const DiagnosticEventData &operator=(const DiagnosticEventData &) = delete; }; +class SymbolChangeEventData : public EventData { +public: + SymbolChangeEventData(lldb::DebuggerWP debugger_wp, ModuleSpec module_spec) + : m_debugger_wp(debugger_wp), m_module_spec(std::move(module_spec)) {} + + static ConstString GetFlavorString(); + ConstString GetFlavor() const override; + + static const SymbolChangeEventData * + GetEventDataFromEvent(const Event *event_ptr); + + void DoOnRemoval(Event *event_ptr) override; + +private: + lldb::DebuggerWP m_debugger_wp; + ModuleSpec m_module_spec; + + SymbolChangeEventData(const SymbolChangeEventData &) = delete; + const SymbolChangeEventData & + operator=(const SymbolChangeEventData &) = delete; +}; + } // namespace lldb_private #endif // LLDB_CORE_DEBUGGER_EVENTS_H diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -60,6 +60,7 @@ bool SetClangModulesCachePath(const FileSpec &path); bool GetEnableExternalLookup() const; bool SetEnableExternalLookup(bool new_value); + bool GetEnableBackgroundLookup() const; bool GetEnableLLDBIndexCache() const; bool SetEnableLLDBIndexCache(bool new_value); uint64_t GetLLDBIndexCacheMaxByteSize(); @@ -457,6 +458,8 @@ static void FindSharedModules(const ModuleSpec &module_spec, ModuleList
&matching_module_list); + static lldb::ModuleSP FindSharedModule(const UUID &uuid); + static size_t RemoveOrphanSharedModules(bool mandatory); static bool RemoveSharedModuleIfOrphaned(const Module *module_ptr); diff --git a/lldb/include/lldb/Symbol/LocateSymbolFile.h b/lldb/include/lldb/Symbol/LocateSymbolFile.h --- a/lldb/include/lldb/Symbol/LocateSymbolFile.h +++ b/lldb/include/lldb/Symbol/LocateSymbolFile.h @@ -14,6 +14,7 @@ #include "lldb/Core/FileSpecList.h" #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/Status.h" +#include "lldb/lldb-forward.h" namespace lldb_private { @@ -52,7 +53,15 @@ // static bool DownloadObjectAndSymbolFile(ModuleSpec &module_spec, Status &error, - bool force_lookup = true); + bool force_lookup = true, + bool copy_executable = true); + + /// Locate the symbol file for the given UUID on a background thread. This + /// function returns immediately. Under the hood it uses the debugger's + /// thread pool to call DownloadObjectAndSymbolFile. If a symbol file is + /// found, this will notify all targets that contain the module with the + /// given UUID. + static void DownloadSymbolFileAsync(const UUID &uuid); }; } // namespace lldb_private diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -162,7 +162,7 @@ bool GetEnableNotifyAboutFixIts() const; FileSpec GetSaveJITObjectsDir() const; - + bool GetEnableSyntheticValue() const; uint32_t GetMaxZeroPaddingInFloatFormat() const; @@ -260,7 +260,7 @@ void DisableASLRValueChangedCallback(); void InheritTCCValueChangedCallback(); void DisableSTDIOValueChangedCallback(); - + // Settings checker for target.jit-save-objects-dir: void CheckJITObjectsDir(); @@ -479,7 +479,8 @@ eBroadcastBitModulesLoaded = (1 << 1), eBroadcastBitModulesUnloaded = (1 << 2), eBroadcastBitWatchpointChanged = (1 << 3), - eBroadcastBitSymbolsLoaded = (1 << 4) + eBroadcastBitSymbolsLoaded = (1 << 4), + eBroadcastBitSymbolsChanged = (1 << 5), }; // These two functions fill out the Broadcaster interface: @@ -981,7 +982,7 @@ ModuleIsExcludedForUnconstrainedSearches(const lldb::ModuleSP &module_sp); const ArchSpec &GetArchitecture() const { return m_arch.GetSpec(); } - + /// Returns the name of the target's ABI plugin. llvm::StringRef GetABIName() const; @@ -1425,30 +1426,30 @@ LazyBool pass = eLazyBoolCalculate; LazyBool notify = eLazyBoolCalculate; LazyBool stop = eLazyBoolCalculate; - DummySignalValues(LazyBool pass, LazyBool notify, LazyBool stop) : - pass(pass), notify(notify), stop(stop) {} + DummySignalValues(LazyBool pass, LazyBool notify, LazyBool stop) + : pass(pass), notify(notify), stop(stop) {} DummySignalValues() = default; }; using DummySignalElement = llvm::StringMapEntry<DummySignalValues>; - static bool UpdateSignalFromDummy(lldb::UnixSignalsSP signals_sp, - const DummySignalElement &element); - static bool ResetSignalFromDummy(lldb::UnixSignalsSP signals_sp, - const DummySignalElement &element); + static bool UpdateSignalFromDummy(lldb::UnixSignalsSP signals_sp, + const DummySignalElement &element); + static bool ResetSignalFromDummy(lldb::UnixSignalsSP signals_sp, + const DummySignalElement &element); public: /// Add a signal to the Target's list of stored signals/actions. These /// values will get copied into any processes launched from /// this target.
- void AddDummySignal(llvm::StringRef name, LazyBool pass, LazyBool print, + void AddDummySignal(llvm::StringRef name, LazyBool pass, LazyBool print, LazyBool stop); /// Updates the signals in signals_sp using the stored dummy signals. /// If warning_stream_sp is not null, if any stored signals are not found in /// the current process, a warning will be emitted here. - void UpdateSignalsFromDummy(lldb::UnixSignalsSP signals_sp, + void UpdateSignalsFromDummy(lldb::UnixSignalsSP signals_sp, lldb::StreamSP warning_stream_sp); /// Clear the dummy signals in signal_names from the target, or all signals /// if signal_names is empty. Also remove the behaviors they set from the - /// process's signals if it exists. + /// process's signals if it exists. void ClearDummySignals(Args &signal_names); /// Print all the signals set in this target. void PrintDummySignals(Stream &strm, Args &signals); @@ -1533,7 +1534,7 @@ lldb::TraceSP m_trace_sp; /// Stores the frame recognizers of this target. lldb::StackFrameRecognizerManagerUP m_frame_recognizer_manager_up; - /// These are used to set the signal state when you don't have a process and + /// These are used to set the signal state when you don't have a process and /// more usefully in the Dummy target where you can't know exactly what /// signals you will have. llvm::StringMap m_dummy_signals; diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -5,6 +5,10 @@ Global, DefaultTrue, Desc<"Control the use of external tools and repositories to locate symbol files. Directories listed in target.debug-file-search-paths and directory of the executable are always checked first for separate debug info files. Then depending on this setting: On macOS, Spotlight would be also used to locate a matching .dSYM bundle based on the UUID of the executable. On NetBSD, directory /usr/libdata/debug would be also searched. 
On platforms other than NetBSD directory /usr/lib/debug would be also searched.">; + def EnableBackgroundLookup: Property<"enable-background-lookup", "Boolean">, + Global, + DefaultFalse, + Desc<"On macOS, enable calling dsymForUUID (or an equivalent script/binary) in the background to locate symbol files that weren't found.">; def ClangModulesCachePath: Property<"clang-modules-cache-path", "FileSpec">, Global, DefaultStringValue<"">, diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -13,6 +13,7 @@ #include "lldb/Core/FormatEntity.h" #include "lldb/Core/Mangled.h" #include "lldb/Core/ModuleList.h" +#include "lldb/Core/ModuleSpec.h" #include "lldb/Core/PluginManager.h" #include "lldb/Core/StreamAsynchronousIO.h" #include "lldb/Core/StreamFile.h" @@ -104,6 +105,7 @@ nullptr; // NOTE: intentional leak to avoid issues with C++ destructor chain static DebuggerList *g_debugger_list_ptr = nullptr; // NOTE: intentional leak to avoid issues with C++ destructor chain +static llvm::ThreadPool *g_thread_pool = nullptr; static constexpr OptionEnumValueElement g_show_disassembly_enum_values[] = { { @@ -538,6 +540,7 @@ "Debugger::Initialize called more than once!"); g_debugger_list_mutex_ptr = new std::recursive_mutex(); g_debugger_list_ptr = new DebuggerList(); + g_thread_pool = new llvm::ThreadPool(llvm::optimal_concurrency()); g_load_plugin_callback = load_plugin_callback; } @@ -545,6 +548,11 @@ assert(g_debugger_list_ptr && "Debugger::Terminate called without a matching Debugger::Initialize!"); + if (g_thread_pool) { + // The destructor will wait for all the threads to complete. + delete g_thread_pool; + } + if (g_debugger_list_ptr && g_debugger_list_mutex_ptr) { // Clear our global list of debugger objects { @@ -1406,6 +1414,18 @@ debugger_id, once); } +void Debugger::ReportSymbolChange(const ModuleSpec &module_spec) { + if (g_debugger_list_ptr && g_debugger_list_mutex_ptr) { + std::lock_guard<std::recursive_mutex> guard(*g_debugger_list_mutex_ptr); + for (DebuggerSP debugger_sp : *g_debugger_list_ptr) { + EventSP event_sp = std::make_shared<Event>( + Debugger::eBroadcastSymbolChange, + new SymbolChangeEventData(debugger_sp, module_spec)); + debugger_sp->GetBroadcaster().BroadcastEvent(event_sp); + } + } +} + static std::shared_ptr<LogHandler> CreateLogHandler(LogHandlerKind log_handler_kind, int fd, bool should_close, size_t buffer_size) { @@ -1702,8 +1722,8 @@ CommandInterpreter::eBroadcastBitAsynchronousErrorData); listener_sp->StartListeningForEvents( - &m_broadcaster, - eBroadcastBitProgress | eBroadcastBitWarning | eBroadcastBitError); + &m_broadcaster, eBroadcastBitProgress | eBroadcastBitWarning | + eBroadcastBitError | eBroadcastSymbolChange); // Let the thread that spawned us know that we have started up and that we // are now listening to all required events so no events get missed @@ -2005,11 +2025,7 @@ } llvm::ThreadPool &Debugger::GetThreadPool() { - // NOTE: intentional leak to avoid issues with C++ destructor chain - static llvm::ThreadPool *g_thread_pool = nullptr; - static llvm::once_flag g_once_flag; - llvm::call_once(g_once_flag, []() { - g_thread_pool = new llvm::ThreadPool(llvm::optimal_concurrency()); - }); + assert(g_thread_pool && + "Debugger::GetThreadPool called before Debugger::Initialize"); return *g_thread_pool; } diff --git a/lldb/source/Core/DebuggerEvents.cpp b/lldb/source/Core/DebuggerEvents.cpp --- a/lldb/source/Core/DebuggerEvents.cpp +++ b/lldb/source/Core/DebuggerEvents.cpp @@ -7,9 +7,12 @@
//===----------------------------------------------------------------------===// #include "lldb/Core/DebuggerEvents.h" +#include "lldb/Core/Debugger.h" +#include "lldb/Core/Module.h" #include "llvm/Support/WithColor.h" using namespace lldb_private; +using namespace lldb; template <typename T> static const T *GetEventDataFromEventImpl(const Event *event_ptr) { @@ -79,3 +82,37 @@ DiagnosticEventData::GetEventDataFromEvent(const Event *event_ptr) { return GetEventDataFromEventImpl<DiagnosticEventData>(event_ptr); } + +ConstString SymbolChangeEventData::GetFlavorString() { + static ConstString g_flavor("SymbolChangeEventData"); + return g_flavor; +} + +ConstString SymbolChangeEventData::GetFlavor() const { + return SymbolChangeEventData::GetFlavorString(); +} + +const SymbolChangeEventData * +SymbolChangeEventData::GetEventDataFromEvent(const Event *event_ptr) { + return GetEventDataFromEventImpl<SymbolChangeEventData>(event_ptr); +} + +void SymbolChangeEventData::DoOnRemoval(Event *event_ptr) { + DebuggerSP debugger_sp(m_debugger_wp.lock()); + if (!debugger_sp) + return; + + for (TargetSP target_sp : debugger_sp->GetTargetList().Targets()) { + if (ModuleSP module_sp = + target_sp->GetImages().FindModule(m_module_spec.GetUUID())) { + { + std::lock_guard<std::recursive_mutex> guard(module_sp->GetMutex()); + if (!module_sp->GetSymbolFileFileSpec()) + module_sp->SetSymbolFileFileSpec(m_module_spec.GetSymbolFileSpec()); + } + ModuleList module_list; + module_list.Append(module_sp); + target_sp->SymbolsDidLoad(module_list); + } + } +} diff --git a/lldb/source/Core/FormatEntity.cpp b/lldb/source/Core/FormatEntity.cpp --- a/lldb/source/Core/FormatEntity.cpp +++ b/lldb/source/Core/FormatEntity.cpp @@ -711,9 +711,6 @@ return false; } - if (valobj == nullptr) - return false; - ValueObject::ExpressionPathAftermath what_next = (do_deref_pointer ?
ValueObject::eExpressionPathAftermathDereference : ValueObject::eExpressionPathAftermathNothing); @@ -1695,7 +1692,7 @@ llvm::StringRef var_representation; const char *var_name = var_value_sp->GetName().GetCString(); if (var_value_sp->GetCompilerType().IsValid()) { - if (var_value_sp && exe_scope->CalculateTarget()) + if (exe_scope && exe_scope->CalculateTarget()) var_value_sp = var_value_sp->GetQualifiedRepresentationIfAvailable( exe_scope->CalculateTarget() diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp --- a/lldb/source/Core/Module.cpp +++ b/lldb/source/Core/Module.cpp @@ -24,6 +24,7 @@ #include "lldb/Interpreter/ScriptInterpreter.h" #include "lldb/Symbol/CompileUnit.h" #include "lldb/Symbol/Function.h" +#include "lldb/Symbol/LocateSymbolFile.h" #include "lldb/Symbol/ObjectFile.h" #include "lldb/Symbol/Symbol.h" #include "lldb/Symbol/SymbolContext.h" @@ -770,7 +771,7 @@ while (i < sc_list.GetSize()) { if (!sc_list.GetContextAtIndex(i, sc)) break; - + bool keep_it = NameMatchesLookupInfo(sc.GetFunctionName(), sc.GetLanguage()); if (keep_it) @@ -1317,8 +1318,11 @@ } UnwindTable &Module::GetUnwindTable() { - if (!m_unwind_table) + if (!m_unwind_table) { m_unwind_table.emplace(*this); + if (!m_symfile_spec) + Symbols::DownloadSymbolFileAsync(GetUUID()); + } return *m_unwind_table; } diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -106,6 +106,12 @@ nullptr, ePropertyEnableExternalLookup, new_value); } +bool ModuleListProperties::GetEnableBackgroundLookup() const { + const uint32_t idx = ePropertyEnableBackgroundLookup; + return m_collection_sp->GetPropertyAtIndexAsBoolean( + nullptr, idx, g_modulelist_properties[idx].default_uint_value != 0); +} + FileSpec ModuleListProperties::GetClangModulesCachePath() const { return m_collection_sp ->GetPropertyAtIndexAsOptionValueFileSpec(nullptr, false, @@ -768,6 +774,10 @@ GetSharedModuleList().FindModules(module_spec, matching_module_list); } +lldb::ModuleSP ModuleList::FindSharedModule(const UUID &uuid) { + return GetSharedModuleList().FindModule(uuid); +} + size_t ModuleList::RemoveOrphanSharedModules(bool mandatory) { return GetSharedModuleList().RemoveOrphans(mandatory); } diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -1609,7 +1609,7 @@ switch (cvt.in(state, input.begin(), input.end(), from_next, &out, &out + 1, to_next)) { case std::codecvt_base::ok: - return out != (int)WEOF; + return out != (EditLineGetCharType)WEOF; case std::codecvt_base::error: case std::codecvt_base::noconv: diff --git a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp --- a/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp +++ b/lldb/source/Plugins/Trace/intel-pt/LibiptDecoder.cpp @@ -295,7 +295,9 @@ break; case ptev_overflow: // The CPU internal buffer had an overflow error and some instructions - // were lost. + // were lost. An OVF packet comes with an FUP packet (hardcoded address) + // according to the documentation, so we'll continue seeing instructions + // after this event.
m_decoded_thread.AppendError(IntelPTError(-pte_overflow)); break; default: diff --git a/lldb/source/Symbol/LocateSymbolFile.cpp b/lldb/source/Symbol/LocateSymbolFile.cpp --- a/lldb/source/Symbol/LocateSymbolFile.cpp +++ b/lldb/source/Symbol/LocateSymbolFile.cpp @@ -8,6 +8,8 @@ #include "lldb/Symbol/LocateSymbolFile.h" +#include "lldb/Core/Debugger.h" +#include "lldb/Core/Module.h" #include "lldb/Core/ModuleList.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Core/Progress.h" @@ -23,7 +25,9 @@ #include "lldb/Utility/Timer.h" #include "lldb/Utility/UUID.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/ThreadPool.h" // From MacOSX system header "mach/machine.h" typedef int cpu_type_t; @@ -397,6 +401,35 @@ return LocateExecutableSymbolFileDsym(module_spec); } +void Symbols::DownloadSymbolFileAsync(const UUID &uuid) { + if (!ModuleList::GetGlobalModuleListProperties().GetEnableBackgroundLookup()) + return; + + static llvm::SmallSet<UUID, 8> g_seen_uuids; + static std::mutex g_mutex; + Debugger::GetThreadPool().async([=]() { + { + std::lock_guard<std::mutex> guard(g_mutex); + if (g_seen_uuids.count(uuid)) + return; + g_seen_uuids.insert(uuid); + } + + Status error; + ModuleSpec module_spec; + module_spec.GetUUID() = uuid; + if (!Symbols::DownloadObjectAndSymbolFile(module_spec, error, + /*force_lookup=*/true, + /*copy_executable=*/false)) + return; + + if (error.Fail()) + return; + + Debugger::ReportSymbolChange(module_spec); + }); +} + #if !defined(__APPLE__) FileSpec Symbols::FindSymbolFileInBundle(const FileSpec &symfile_bundle, @@ -407,7 +440,8 @@ } bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, - Status &error, bool force_lookup) { + Status &error, bool force_lookup, + bool copy_executable) { // Fill in the module_spec.GetFileSpec() for the object file and/or the // module_spec.GetSymbolFileSpec() for the debug symbols file. return false; diff --git a/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp b/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp --- a/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp +++ b/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp @@ -554,7 +554,8 @@ } bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, - Status &error, bool force_lookup) { + Status &error, bool force_lookup, + bool copy_executable) { const UUID *uuid_ptr = module_spec.GetUUIDPtr(); const FileSpec *file_spec_ptr = module_spec.GetFileSpecPtr(); @@ -584,15 +585,18 @@ // Create the dsymForUUID command. StreamString command; + const char *copy_executable_arg = copy_executable ?
"--copyExecutable " : ""; if (!uuid_str.empty()) { - command.Printf("%s --ignoreNegativeCache --copyExecutable %s", - dsymForUUID_exe_path.c_str(), uuid_str.c_str()); + command.Printf("%s --ignoreNegativeCache %s%s", + dsymForUUID_exe_path.c_str(), copy_executable_arg, + uuid_str.c_str()); LLDB_LOGF(log, "Calling %s with UUID %s to find dSYM: %s", dsymForUUID_exe_path.c_str(), uuid_str.c_str(), command.GetString().data()); } else if (!file_path_str.empty()) { - command.Printf("%s --ignoreNegativeCache --copyExecutable %s", - dsymForUUID_exe_path.c_str(), file_path_str.c_str()); + command.Printf("%s --ignoreNegativeCache %s%s", + dsymForUUID_exe_path.c_str(), copy_executable_arg, + file_path_str.c_str()); LLDB_LOGF(log, "Calling %s with file %s to find dSYM: %s", dsymForUUID_exe_path.c_str(), file_path_str.c_str(), command.GetString().data()); diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp --- a/lldb/source/Target/Platform.cpp +++ b/lldb/source/Target/Platform.cpp @@ -2060,10 +2060,9 @@ // the same platform supports all architectures then that's the obvious next // best thing. if (candidates.size() == archs.size()) { - if (std::all_of(candidates.begin(), candidates.end(), - [&](const PlatformSP &p) -> bool { - return p->GetName() == candidates.front()->GetName(); - })) { + if (llvm::all_of(candidates, [&](const PlatformSP &p) -> bool { + return p->GetName() == candidates.front()->GetName(); + })) { return candidates.front(); } } diff --git a/lldb/source/Utility/Event.cpp b/lldb/source/Utility/Event.cpp --- a/lldb/source/Utility/Event.cpp +++ b/lldb/source/Utility/Event.cpp @@ -124,9 +124,7 @@ } void EventDataBytes::Dump(Stream *s) const { - size_t num_printable_chars = - std::count_if(m_bytes.begin(), m_bytes.end(), llvm::isPrint); - if (num_printable_chars == m_bytes.size()) + if (llvm::all_of(m_bytes, llvm::isPrint)) s->Format("\"{0}\"", m_bytes); else s->Format("{0:$[ ]@[x-2]}", llvm::make_range( diff --git a/lldb/test/API/api/check_public_api_headers/TestPublicAPIHeaders.py b/lldb/test/API/api/check_public_api_headers/TestPublicAPIHeaders.py --- a/lldb/test/API/api/check_public_api_headers/TestPublicAPIHeaders.py +++ b/lldb/test/API/api/check_public_api_headers/TestPublicAPIHeaders.py @@ -3,9 +3,6 @@ There should be nothing unwanted there and a simpe main.cpp which includes SB*.h should compile and link with the LLDB framework.""" -from __future__ import print_function - - from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil diff --git a/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py b/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py --- a/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py +++ b/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py @@ -1,8 +1,5 @@ """Test the lldb public C++ api for returning SBCommandReturnObject.""" -from __future__ import print_function - - from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil diff --git a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py --- a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py +++ b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py @@ -1,8 +1,5 @@ """Test the lldb public C++ api when doing multiple debug sessions simultaneously.""" -from __future__ import print_function - - import os import lldb 
diff --git a/lldb/test/API/api/multiple-targets/TestMultipleTargets.py b/lldb/test/API/api/multiple-targets/TestMultipleTargets.py --- a/lldb/test/API/api/multiple-targets/TestMultipleTargets.py +++ b/lldb/test/API/api/multiple-targets/TestMultipleTargets.py @@ -1,8 +1,5 @@ """Test the lldb public C++ api when creating multiple targets simultaneously.""" -from __future__ import print_function - - import os import lldb diff --git a/lldb/test/API/api/multithreaded/TestMultithreaded.py b/lldb/test/API/api/multithreaded/TestMultithreaded.py --- a/lldb/test/API/api/multithreaded/TestMultithreaded.py +++ b/lldb/test/API/api/multithreaded/TestMultithreaded.py @@ -1,7 +1,5 @@ """Test the lldb public C++ api breakpoint callbacks.""" -from __future__ import print_function - # __package__ = "lldbsuite.test" diff --git a/lldb/test/API/arm/emulation/TestEmulations.py b/lldb/test/API/arm/emulation/TestEmulations.py --- a/lldb/test/API/arm/emulation/TestEmulations.py +++ b/lldb/test/API/arm/emulation/TestEmulations.py @@ -2,9 +2,6 @@ Test some ARM instruction emulation. """ -from __future__ import print_function - - import os import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/benchmarks/continue/TestBenchmarkContinue.py b/lldb/test/API/benchmarks/continue/TestBenchmarkContinue.py --- a/lldb/test/API/benchmarks/continue/TestBenchmarkContinue.py +++ b/lldb/test/API/benchmarks/continue/TestBenchmarkContinue.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbbench import * diff --git a/lldb/test/API/benchmarks/expression/TestExpressionCmd.py b/lldb/test/API/benchmarks/expression/TestExpressionCmd.py --- a/lldb/test/API/benchmarks/expression/TestExpressionCmd.py +++ b/lldb/test/API/benchmarks/expression/TestExpressionCmd.py @@ -1,8 +1,5 @@ """Test lldb's expression evaluations and collect statistics.""" -from __future__ import print_function - - import sys import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py b/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py --- a/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py +++ b/lldb/test/API/benchmarks/expression/TestRepeatedExprs.py @@ -1,8 +1,5 @@ """Test evaluating expressions repeatedly comparing lldb against gdb.""" -from __future__ import print_function - - import sys import lldb from lldbsuite.test.lldbbench import BenchBase diff --git a/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py b/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py --- a/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py +++ b/lldb/test/API/benchmarks/frame_variable/TestFrameVariableResponse.py @@ -1,8 +1,5 @@ """Test lldb's response time for 'frame variable' command.""" -from __future__ import print_function - - import sys import lldb from lldbsuite.test import configuration diff --git a/lldb/test/API/benchmarks/libcxxlist/TestBenchmarkLibcxxList.py b/lldb/test/API/benchmarks/libcxxlist/TestBenchmarkLibcxxList.py --- a/lldb/test/API/benchmarks/libcxxlist/TestBenchmarkLibcxxList.py +++ b/lldb/test/API/benchmarks/libcxxlist/TestBenchmarkLibcxxList.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbbench import * diff --git a/lldb/test/API/benchmarks/libcxxmap/TestBenchmarkLibcxxMap.py b/lldb/test/API/benchmarks/libcxxmap/TestBenchmarkLibcxxMap.py --- a/lldb/test/API/benchmarks/libcxxmap/TestBenchmarkLibcxxMap.py +++ b/lldb/test/API/benchmarks/libcxxmap/TestBenchmarkLibcxxMap.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - - import lldb from lldbsuite.test.lldbbench import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/benchmarks/startup/TestStartupDelays.py b/lldb/test/API/benchmarks/startup/TestStartupDelays.py --- a/lldb/test/API/benchmarks/startup/TestStartupDelays.py +++ b/lldb/test/API/benchmarks/startup/TestStartupDelays.py @@ -1,8 +1,5 @@ """Test lldb's startup delays creating a target, setting a breakpoint, and run to breakpoint stop.""" -from __future__ import print_function - - import sys import lldb from lldbsuite.test import configuration diff --git a/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py b/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py --- a/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py +++ b/lldb/test/API/benchmarks/stepping/TestSteppingSpeed.py @@ -1,7 +1,5 @@ """Test lldb's stepping speed.""" -from __future__ import print_function - import sys import lldb from lldbsuite.test import configuration diff --git a/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py b/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py --- a/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py +++ b/lldb/test/API/benchmarks/turnaround/TestCompileRunToBreakpointTurnaround.py @@ -1,8 +1,5 @@ """Benchmark the turnaround time starting a debugger and run to the breakpoint with lldb vs. 
gdb.""" -from __future__ import print_function - - import sys import lldb from lldbsuite.test.lldbbench import * diff --git a/lldb/test/API/commands/command/container/welcome.py b/lldb/test/API/commands/command/container/welcome.py --- a/lldb/test/API/commands/command/container/welcome.py +++ b/lldb/test/API/commands/command/container/welcome.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import sys diff --git a/lldb/test/API/commands/command/script/decorated.py b/lldb/test/API/commands/command/script/decorated.py --- a/lldb/test/API/commands/command/script/decorated.py +++ b/lldb/test/API/commands/command/script/decorated.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import lldb diff --git a/lldb/test/API/commands/command/script/import/bar/bar.py b/lldb/test/API/commands/command/script/import/bar/bar.py --- a/lldb/test/API/commands/command/script/import/bar/bar.py +++ b/lldb/test/API/commands/command/script/import/bar/bar.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - def bar_function(debugger, args, result, dict): global UtilityModule print(UtilityModule.barutil_function("bar told me " + args), file=result) diff --git a/lldb/test/API/commands/command/script/import/foo/bar/foobar.py b/lldb/test/API/commands/command/script/import/foo/bar/foobar.py --- a/lldb/test/API/commands/command/script/import/foo/bar/foobar.py +++ b/lldb/test/API/commands/command/script/import/foo/bar/foobar.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - def foo_function(debugger, args, result, dict): print("foobar says " + args, file=result) return None diff --git a/lldb/test/API/commands/command/script/import/foo/foo.py b/lldb/test/API/commands/command/script/import/foo/foo.py --- a/lldb/test/API/commands/command/script/import/foo/foo.py +++ b/lldb/test/API/commands/command/script/import/foo/foo.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - def foo_function(debugger, args, result, dict): print("foo says " + args, file=result) return None diff --git a/lldb/test/API/commands/command/script/import/foo/foo2.py b/lldb/test/API/commands/command/script/import/foo/foo2.py --- a/lldb/test/API/commands/command/script/import/foo/foo2.py +++ b/lldb/test/API/commands/command/script/import/foo/foo2.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - def foo2_function(debugger, args, result, dict): print("foo2 says " + args, file=result) return None diff --git a/lldb/test/API/commands/command/script/import/thepackage/__init__.py b/lldb/test/API/commands/command/script/import/thepackage/__init__.py --- a/lldb/test/API/commands/command/script/import/thepackage/__init__.py +++ b/lldb/test/API/commands/command/script/import/thepackage/__init__.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from . import TPunitA from . 
import TPunitB diff --git a/lldb/test/API/commands/command/script/mysto.py b/lldb/test/API/commands/command/script/mysto.py --- a/lldb/test/API/commands/command/script/mysto.py +++ b/lldb/test/API/commands/command/script/mysto.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import lldb diff --git a/lldb/test/API/commands/command/script/welcome.py b/lldb/test/API/commands/command/script/welcome.py --- a/lldb/test/API/commands/command/script/welcome.py +++ b/lldb/test/API/commands/command/script/welcome.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import sys diff --git a/lldb/test/API/commands/command/script_alias/tcsacmd.py b/lldb/test/API/commands/command/script_alias/tcsacmd.py --- a/lldb/test/API/commands/command/script_alias/tcsacmd.py +++ b/lldb/test/API/commands/command/script_alias/tcsacmd.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb diff --git a/lldb/test/API/commands/command/source/my.py b/lldb/test/API/commands/command/source/my.py --- a/lldb/test/API/commands/command/source/my.py +++ b/lldb/test/API/commands/command/source/my.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - def date(): import datetime today = datetime.date.today() diff --git a/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py b/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py --- a/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py +++ b/lldb/test/API/commands/expression/no-deadlock/TestExprDoesntBlock.py @@ -2,9 +2,6 @@ Test that expr will time out and allow other threads to run if it blocks. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/commands/process/launch/TestProcessLaunch.py b/lldb/test/API/commands/process/launch/TestProcessLaunch.py --- a/lldb/test/API/commands/process/launch/TestProcessLaunch.py +++ b/lldb/test/API/commands/process/launch/TestProcessLaunch.py @@ -2,8 +2,6 @@ Test lldb process launch flags. """ -from __future__ import print_function - import os import lldb diff --git a/lldb/test/API/commands/register/register/register_command/TestRegisters.py b/lldb/test/API/commands/register/register/register_command/TestRegisters.py --- a/lldb/test/API/commands/register/register/register_command/TestRegisters.py +++ b/lldb/test/API/commands/register/register/register_command/TestRegisters.py @@ -2,9 +2,6 @@ Test the 'register' command. """ -from __future__ import print_function - - import os import sys import lldb diff --git a/lldb/test/API/commands/watchpoints/multiple_threads/TestWatchpointMultipleThreads.py b/lldb/test/API/commands/watchpoints/multiple_threads/TestWatchpointMultipleThreads.py --- a/lldb/test/API/commands/watchpoints/multiple_threads/TestWatchpointMultipleThreads.py +++ b/lldb/test/API/commands/watchpoints/multiple_threads/TestWatchpointMultipleThreads.py @@ -2,9 +2,6 @@ Test that lldb watchpoint works for multiple threads. 
""" -from __future__ import print_function - - import re import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py b/lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py --- a/lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py +++ b/lldb/test/API/commands/watchpoints/watchpoint_events/TestWatchpointEvents.py @@ -1,8 +1,5 @@ """Test that adding, deleting and modifying watchpoints sends the appropriate events.""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py --- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommandsFromPython.py @@ -2,9 +2,6 @@ Test that you can set breakpoint commands successfully with the Python API's: """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/bktptcmd.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/bktptcmd.py --- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/bktptcmd.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/bktptcmd.py @@ -1,4 +1,3 @@ -from __future__ import print_function import side_effect def useless_function(first, second): diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_in_delayslot/TestAvoidBreakpointInDelaySlot.py b/lldb/test/API/functionalities/breakpoint/breakpoint_in_delayslot/TestAvoidBreakpointInDelaySlot.py --- a/lldb/test/API/functionalities/breakpoint/breakpoint_in_delayslot/TestAvoidBreakpointInDelaySlot.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_in_delayslot/TestAvoidBreakpointInDelaySlot.py @@ -2,8 +2,6 @@ Test specific to MIPS """ -from __future__ import print_function - import re import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/breakpoint/cpp/TestCPPBreakpointLocations.py b/lldb/test/API/functionalities/breakpoint/cpp/TestCPPBreakpointLocations.py --- a/lldb/test/API/functionalities/breakpoint/cpp/TestCPPBreakpointLocations.py +++ b/lldb/test/API/functionalities/breakpoint/cpp/TestCPPBreakpointLocations.py @@ -2,9 +2,6 @@ Test lldb breakpoint ids. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/breakpoint/move_nearest/TestMoveNearest.py b/lldb/test/API/functionalities/breakpoint/move_nearest/TestMoveNearest.py --- a/lldb/test/API/functionalities/breakpoint/move_nearest/TestMoveNearest.py +++ b/lldb/test/API/functionalities/breakpoint/move_nearest/TestMoveNearest.py @@ -1,6 +1,3 @@ -from __future__ import print_function - - import lldb from lldbsuite.test.lldbtest import * import lldbsuite.test.lldbutil as lldbutil diff --git a/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py b/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py --- a/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py +++ b/lldb/test/API/functionalities/conditional_break/TestConditionalBreak.py @@ -2,9 +2,6 @@ Test conditionally break on a function and inspect its variables. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSNumber.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSNumber.py --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSNumber.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSNumber.py @@ -3,8 +3,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py b/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py --- a/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-python-synth/TestDataFormatterPythonSynth.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-synthval/TestDataFormatterSynthVal.py b/lldb/test/API/functionalities/data-formatter/data-formatter-synthval/TestDataFormatterSynthVal.py --- a/lldb/test/API/functionalities/data-formatter/data-formatter-synthval/TestDataFormatterSynthVal.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-synthval/TestDataFormatterSynthVal.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/data-formatter/dump_dynamic/TestDumpDynamic.py b/lldb/test/API/functionalities/data-formatter/dump_dynamic/TestDumpDynamic.py --- a/lldb/test/API/functionalities/data-formatter/dump_dynamic/TestDumpDynamic.py +++ b/lldb/test/API/functionalities/data-formatter/dump_dynamic/TestDumpDynamic.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from lldbsuite.test import lldbinline lldbinline.MakeInlineTest( diff --git a/lldb/test/API/functionalities/data-formatter/vector-types/TestVectorTypesFormatting.py b/lldb/test/API/functionalities/data-formatter/vector-types/TestVectorTypesFormatting.py --- a/lldb/test/API/functionalities/data-formatter/vector-types/TestVectorTypesFormatting.py +++ b/lldb/test/API/functionalities/data-formatter/vector-types/TestVectorTypesFormatting.py @@ -2,9 +2,6 @@ Check that vector types format properly """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/exec/TestExec.py b/lldb/test/API/functionalities/exec/TestExec.py --- a/lldb/test/API/functionalities/exec/TestExec.py +++ b/lldb/test/API/functionalities/exec/TestExec.py @@ -1,8 +1,6 @@ """ Test some lldb command abbreviations. """ -from __future__ import print_function - import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegOffsets.py b/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegOffsets.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegOffsets.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegOffsets.py @@ -1,4 +1,3 @@ -from __future__ import print_function from textwrap import dedent import lldb from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestArmRegisterDefinition.py b/lldb/test/API/functionalities/gdb_remote_client/TestArmRegisterDefinition.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestArmRegisterDefinition.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestArmRegisterDefinition.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestFork.py b/lldb/test/API/functionalities/gdb_remote_client/TestFork.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestFork.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestFork.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import unittest from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerNoTargetXML.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerNoTargetXML.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerNoTargetXML.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerNoTargetXML.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerTargetXML.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerTargetXML.py --- 
a/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerTargetXML.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBServerTargetXML.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py b/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestJLink6Armv7RegisterDefinition.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestMultiprocess.py b/lldb/test/API/functionalities/gdb_remote_client/TestMultiprocess.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestMultiprocess.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestMultiprocess.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import unittest from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestNestedRegDefinitions.py b/lldb/test/API/functionalities/gdb_remote_client/TestNestedRegDefinitions.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestNestedRegDefinitions.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestNestedRegDefinitions.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestNoGPacketSupported.py b/lldb/test/API/functionalities/gdb_remote_client/TestNoGPacketSupported.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestNoGPacketSupported.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestNoGPacketSupported.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestNoWatchpointSupportInfo.py b/lldb/test/API/functionalities/gdb_remote_client/TestNoWatchpointSupportInfo.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestNoWatchpointSupportInfo.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestNoWatchpointSupportInfo.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestPartialGPacket.py b/lldb/test/API/functionalities/gdb_remote_client/TestPartialGPacket.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestPartialGPacket.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestPartialGPacket.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestRegDefinitionInParts.py b/lldb/test/API/functionalities/gdb_remote_client/TestRegDefinitionInParts.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestRegDefinitionInParts.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestRegDefinitionInParts.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import time from 
lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestRemoteRegNums.py b/lldb/test/API/functionalities/gdb_remote_client/TestRemoteRegNums.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestRemoteRegNums.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestRemoteRegNums.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py b/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestRestartBug.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestTargetXMLArch.py b/lldb/test/API/functionalities/gdb_remote_client/TestTargetXMLArch.py --- a/lldb/test/API/functionalities/gdb_remote_client/TestTargetXMLArch.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestTargetXMLArch.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/inferior-assert/TestInferiorAssert.py b/lldb/test/API/functionalities/inferior-assert/TestInferiorAssert.py --- a/lldb/test/API/functionalities/inferior-assert/TestInferiorAssert.py +++ b/lldb/test/API/functionalities/inferior-assert/TestInferiorAssert.py @@ -1,8 +1,5 @@ """Test that lldb functions correctly after the inferior has asserted.""" -from __future__ import print_function - - import lldb from lldbsuite.test import lldbutil from lldbsuite.test import lldbplatformutil diff --git a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py --- a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py +++ b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py @@ -2,9 +2,6 @@ Test that breakpoint by symbol name works correctly with dynamic libs. """ -from __future__ import print_function - - import os import re import lldb diff --git a/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py b/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py --- a/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py +++ b/lldb/test/API/functionalities/multidebugger_commands/TestMultipleDebuggersCommands.py @@ -2,9 +2,6 @@ Test that commands do not try and hold on to stale CommandInterpreters in a multiple debuggers scenario """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/plugins/command_plugin/TestPluginCommands.py b/lldb/test/API/functionalities/plugins/command_plugin/TestPluginCommands.py --- a/lldb/test/API/functionalities/plugins/command_plugin/TestPluginCommands.py +++ b/lldb/test/API/functionalities/plugins/command_plugin/TestPluginCommands.py @@ -2,9 +2,6 @@ Test that plugins that load commands work correctly. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py b/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py --- a/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py +++ b/lldb/test/API/functionalities/plugins/python_os_plugin/stepping_plugin_threads/TestOSPluginStepping.py @@ -3,9 +3,6 @@ all threads at every stop. """ -from __future__ import print_function - - import os import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py --- a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py +++ b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py @@ -2,8 +2,6 @@ Test basics of linux core file debugging. """ -from __future__ import division, print_function - import shutil import struct import os diff --git a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py --- a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py +++ b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py @@ -2,8 +2,6 @@ Test NetBSD core file debugging. """ -from __future__ import division, print_function - import signal import os diff --git a/lldb/test/API/functionalities/recursion/TestValueObjectRecursion.py b/lldb/test/API/functionalities/recursion/TestValueObjectRecursion.py --- a/lldb/test/API/functionalities/recursion/TestValueObjectRecursion.py +++ b/lldb/test/API/functionalities/recursion/TestValueObjectRecursion.py @@ -2,9 +2,6 @@ Test lldb data formatter subsystem. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py b/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py --- a/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py +++ b/lldb/test/API/functionalities/target-new-solib-notifications/TestModuleLoadedNotifys.py @@ -3,9 +3,6 @@ they should be delivered in batches instead of one-by-one. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/tty/TestTerminal.py b/lldb/test/API/functionalities/tty/TestTerminal.py --- a/lldb/test/API/functionalities/tty/TestTerminal.py +++ b/lldb/test/API/functionalities/tty/TestTerminal.py @@ -2,9 +2,6 @@ Test lldb command aliases. 
""" -from __future__ import print_function - - import unittest2 import os import lldb diff --git a/lldb/test/API/functionalities/unwind/noreturn/TestNoreturnUnwind.py b/lldb/test/API/functionalities/unwind/noreturn/TestNoreturnUnwind.py --- a/lldb/test/API/functionalities/unwind/noreturn/TestNoreturnUnwind.py +++ b/lldb/test/API/functionalities/unwind/noreturn/TestNoreturnUnwind.py @@ -2,9 +2,6 @@ Test that we can backtrace correctly with 'noreturn' functions on the stack """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py b/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py --- a/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py +++ b/lldb/test/API/functionalities/unwind/sigtramp/TestSigtrampUnwind.py @@ -2,9 +2,6 @@ Test that we can backtrace correctly with 'sigtramp' functions on the stack """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py b/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py --- a/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py +++ b/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py @@ -21,9 +21,6 @@ when using API directly, for example in LLDB-MI. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/c/step_over_no_deadlock/TestStepOverDoesntBlock.py b/lldb/test/API/lang/c/step_over_no_deadlock/TestStepOverDoesntBlock.py --- a/lldb/test/API/lang/c/step_over_no_deadlock/TestStepOverDoesntBlock.py +++ b/lldb/test/API/lang/c/step_over_no_deadlock/TestStepOverDoesntBlock.py @@ -2,9 +2,6 @@ Test that step over will let other threads run when necessary """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py b/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py --- a/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py +++ b/lldb/test/API/lang/cpp/class_types/TestClassTypesDisassembly.py @@ -2,9 +2,6 @@ Test the lldb disassemble command on each call frame when stopped on C's ctor. """ -from __future__ import print_function - - import os import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/lang/cpp/dynamic-value/TestCppValueCast.py b/lldb/test/API/lang/cpp/dynamic-value/TestCppValueCast.py --- a/lldb/test/API/lang/cpp/dynamic-value/TestCppValueCast.py +++ b/lldb/test/API/lang/cpp/dynamic-value/TestCppValueCast.py @@ -2,9 +2,6 @@ Test lldb Python API SBValue::Cast(SBType) for C++ types. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py b/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py --- a/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py +++ b/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py @@ -2,9 +2,6 @@ Test the lldb disassemble command on lib stdc++. 
""" -from __future__ import print_function - - import os import lldb from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/objc/direct-dispatch-step/TestObjCDirectDispatchStepping.py b/lldb/test/API/lang/objc/direct-dispatch-step/TestObjCDirectDispatchStepping.py --- a/lldb/test/API/lang/objc/direct-dispatch-step/TestObjCDirectDispatchStepping.py +++ b/lldb/test/API/lang/objc/direct-dispatch-step/TestObjCDirectDispatchStepping.py @@ -1,8 +1,5 @@ """Test stepping through ObjC method dispatch in various forms.""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/objc/foundation/TestObjCMethods.py b/lldb/test/API/lang/objc/foundation/TestObjCMethods.py --- a/lldb/test/API/lang/objc/foundation/TestObjCMethods.py +++ b/lldb/test/API/lang/objc/foundation/TestObjCMethods.py @@ -3,9 +3,6 @@ Also lookup objective-c data types and evaluate expressions. """ -from __future__ import print_function - - import os import os.path import lldb diff --git a/lldb/test/API/lang/objc/foundation/TestObjectDescriptionAPI.py b/lldb/test/API/lang/objc/foundation/TestObjectDescriptionAPI.py --- a/lldb/test/API/lang/objc/foundation/TestObjectDescriptionAPI.py +++ b/lldb/test/API/lang/objc/foundation/TestObjectDescriptionAPI.py @@ -2,9 +2,6 @@ Test SBValue.GetObjectDescription() with the value from SBTarget.FindGlobalVariables(). """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py b/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py --- a/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py +++ b/lldb/test/API/lang/objc/objc-class-method/TestObjCClassMethod.py @@ -1,8 +1,5 @@ """Test calling functions in class methods.""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/objc/objc-stepping/TestObjCStepping.py b/lldb/test/API/lang/objc/objc-stepping/TestObjCStepping.py --- a/lldb/test/API/lang/objc/objc-stepping/TestObjCStepping.py +++ b/lldb/test/API/lang/objc/objc-stepping/TestObjCStepping.py @@ -1,8 +1,5 @@ """Test stepping through ObjC method dispatch in various forms.""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lang/objc/print-obj/TestPrintObj.py b/lldb/test/API/lang/objc/print-obj/TestPrintObj.py --- a/lldb/test/API/lang/objc/print-obj/TestPrintObj.py +++ b/lldb/test/API/lang/objc/print-obj/TestPrintObj.py @@ -2,9 +2,6 @@ Test "print object" where another thread blocks the print object from making progress. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/lldbtest.py b/lldb/test/API/lldbtest.py --- a/lldb/test/API/lldbtest.py +++ b/lldb/test/API/lldbtest.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import os import re import operator diff --git a/lldb/test/API/macosx/queues/TestQueues.py b/lldb/test/API/macosx/queues/TestQueues.py --- a/lldb/test/API/macosx/queues/TestQueues.py +++ b/lldb/test/API/macosx/queues/TestQueues.py @@ -1,8 +1,5 @@ """Test queues inspection SB APIs.""" -from __future__ import print_function - - import os import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/python_api/default-constructor/TestDefaultConstructorForAPIObjects.py b/lldb/test/API/python_api/default-constructor/TestDefaultConstructorForAPIObjects.py --- a/lldb/test/API/python_api/default-constructor/TestDefaultConstructorForAPIObjects.py +++ b/lldb/test/API/python_api/default-constructor/TestDefaultConstructorForAPIObjects.py @@ -11,9 +11,6 @@ after default construction. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/disassemble-raw-data/TestDisassembleRawData.py b/lldb/test/API/python_api/disassemble-raw-data/TestDisassembleRawData.py --- a/lldb/test/API/python_api/disassemble-raw-data/TestDisassembleRawData.py +++ b/lldb/test/API/python_api/disassemble-raw-data/TestDisassembleRawData.py @@ -2,9 +2,6 @@ Use lldb Python API to disassemble raw machine code bytes """ -from __future__ import print_function - - import re import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/python_api/disassemble-raw-data/TestDisassemble_VST1_64.py b/lldb/test/API/python_api/disassemble-raw-data/TestDisassemble_VST1_64.py --- a/lldb/test/API/python_api/disassemble-raw-data/TestDisassemble_VST1_64.py +++ b/lldb/test/API/python_api/disassemble-raw-data/TestDisassemble_VST1_64.py @@ -2,8 +2,6 @@ Use lldb Python API to disassemble raw machine code bytes """ -from __future__ import print_function - from io import StringIO import sys diff --git a/lldb/test/API/python_api/event/TestEvents.py b/lldb/test/API/python_api/event/TestEvents.py --- a/lldb/test/API/python_api/event/TestEvents.py +++ b/lldb/test/API/python_api/event/TestEvents.py @@ -2,9 +2,6 @@ Test lldb Python event APIs. """ -from __future__ import print_function - - import re import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py b/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py --- a/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py +++ b/lldb/test/API/python_api/formatters/TestFormattersSBAPI.py @@ -1,8 +1,5 @@ """Test Python APIs for working with formatters""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/frame/TestFrames.py b/lldb/test/API/python_api/frame/TestFrames.py --- a/lldb/test/API/python_api/frame/TestFrames.py +++ b/lldb/test/API/python_api/frame/TestFrames.py @@ -3,8 +3,6 @@ And other SBFrame API tests. 
""" -from __future__ import print_function - import io import lldb diff --git a/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py b/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py --- a/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py +++ b/lldb/test/API/python_api/frame/get-variables/TestGetVariables.py @@ -2,9 +2,6 @@ Test that SBFrame::GetVariables() calls work correctly. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py b/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py --- a/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py +++ b/lldb/test/API/python_api/frame/inlines/TestInlinedFrame.py @@ -2,9 +2,6 @@ Testlldb Python SBFrame APIs IsInlined() and GetFunctionName(). """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py b/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py --- a/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py +++ b/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py @@ -2,9 +2,6 @@ Test retrieval of SBAddress from function/symbol, disassembly, and SBAddress APIs. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py b/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py --- a/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py +++ b/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py @@ -2,9 +2,6 @@ Test newly added SBSymbol and SBAddress APIs. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/interpreter/TestCommandInterpreterAPI.py b/lldb/test/API/python_api/interpreter/TestCommandInterpreterAPI.py --- a/lldb/test/API/python_api/interpreter/TestCommandInterpreterAPI.py +++ b/lldb/test/API/python_api/interpreter/TestCommandInterpreterAPI.py @@ -1,8 +1,5 @@ """Test the SBCommandInterpreter APIs.""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/lldbutil/frame/TestFrameUtils.py b/lldb/test/API/python_api/lldbutil/frame/TestFrameUtils.py --- a/lldb/test/API/python_api/lldbutil/frame/TestFrameUtils.py +++ b/lldb/test/API/python_api/lldbutil/frame/TestFrameUtils.py @@ -2,9 +2,6 @@ Test utility functions for the frame object. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/lldbutil/iter/TestLLDBIterator.py b/lldb/test/API/python_api/lldbutil/iter/TestLLDBIterator.py --- a/lldb/test/API/python_api/lldbutil/iter/TestLLDBIterator.py +++ b/lldb/test/API/python_api/lldbutil/iter/TestLLDBIterator.py @@ -2,9 +2,6 @@ Test the iteration protocol for some lldb container objects. 
""" -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/lldbutil/iter/TestRegistersIterator.py b/lldb/test/API/python_api/lldbutil/iter/TestRegistersIterator.py --- a/lldb/test/API/python_api/lldbutil/iter/TestRegistersIterator.py +++ b/lldb/test/API/python_api/lldbutil/iter/TestRegistersIterator.py @@ -2,9 +2,6 @@ Test the iteration protocol for frame registers. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/module_section/TestModuleAndSection.py b/lldb/test/API/python_api/module_section/TestModuleAndSection.py --- a/lldb/test/API/python_api/module_section/TestModuleAndSection.py +++ b/lldb/test/API/python_api/module_section/TestModuleAndSection.py @@ -2,9 +2,6 @@ Test some SBModule and SBSection APIs. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/process/TestProcessAPI.py b/lldb/test/API/python_api/process/TestProcessAPI.py --- a/lldb/test/API/python_api/process/TestProcessAPI.py +++ b/lldb/test/API/python_api/process/TestProcessAPI.py @@ -2,9 +2,6 @@ Test SBProcess APIs, including ReadMemory(), WriteMemory(), and others. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/process/io/TestProcessIO.py b/lldb/test/API/python_api/process/io/TestProcessIO.py --- a/lldb/test/API/python_api/process/io/TestProcessIO.py +++ b/lldb/test/API/python_api/process/io/TestProcessIO.py @@ -1,8 +1,5 @@ """Test Python APIs for process IO.""" -from __future__ import print_function - - import os import lldb from lldbsuite.test.decorators import * diff --git a/lldb/test/API/python_api/symbol-context/TestSymbolContext.py b/lldb/test/API/python_api/symbol-context/TestSymbolContext.py --- a/lldb/test/API/python_api/symbol-context/TestSymbolContext.py +++ b/lldb/test/API/python_api/symbol-context/TestSymbolContext.py @@ -2,8 +2,6 @@ Test SBSymbolContext APIs. """ -from __future__ import print_function - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/target/TestTargetAPI.py b/lldb/test/API/python_api/target/TestTargetAPI.py --- a/lldb/test/API/python_api/target/TestTargetAPI.py +++ b/lldb/test/API/python_api/target/TestTargetAPI.py @@ -2,9 +2,6 @@ Test SBTarget APIs. """ -from __future__ import print_function - - import unittest2 import os import lldb diff --git a/lldb/test/API/python_api/thread/TestThreadAPI.py b/lldb/test/API/python_api/thread/TestThreadAPI.py --- a/lldb/test/API/python_api/thread/TestThreadAPI.py +++ b/lldb/test/API/python_api/thread/TestThreadAPI.py @@ -2,9 +2,6 @@ Test SBThread APIs. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/type/TestTypeList.py b/lldb/test/API/python_api/type/TestTypeList.py --- a/lldb/test/API/python_api/type/TestTypeList.py +++ b/lldb/test/API/python_api/type/TestTypeList.py @@ -2,10 +2,6 @@ Test SBType and SBTypeList API. 
""" -from __future__ import print_function - - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/value/TestValueAPI.py b/lldb/test/API/python_api/value/TestValueAPI.py --- a/lldb/test/API/python_api/value/TestValueAPI.py +++ b/lldb/test/API/python_api/value/TestValueAPI.py @@ -2,9 +2,6 @@ Test some SBValue APIs. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/value/linked_list/TestValueAPILinkedList.py b/lldb/test/API/python_api/value/linked_list/TestValueAPILinkedList.py --- a/lldb/test/API/python_api/value/linked_list/TestValueAPILinkedList.py +++ b/lldb/test/API/python_api/value/linked_list/TestValueAPILinkedList.py @@ -3,9 +3,6 @@ supports iteration till the end of list is reached. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/TestSetWatchpoint.py b/lldb/test/API/python_api/watchpoint/TestSetWatchpoint.py --- a/lldb/test/API/python_api/watchpoint/TestSetWatchpoint.py +++ b/lldb/test/API/python_api/watchpoint/TestSetWatchpoint.py @@ -2,9 +2,6 @@ Use lldb Python SBValue API to create a watchpoint for read_write of 'globl' var. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/TestWatchpointIgnoreCount.py b/lldb/test/API/python_api/watchpoint/TestWatchpointIgnoreCount.py --- a/lldb/test/API/python_api/watchpoint/TestWatchpointIgnoreCount.py +++ b/lldb/test/API/python_api/watchpoint/TestWatchpointIgnoreCount.py @@ -2,9 +2,6 @@ Use lldb Python SBWatchpoint API to set the ignore count. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py b/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py --- a/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py +++ b/lldb/test/API/python_api/watchpoint/TestWatchpointIter.py @@ -2,10 +2,6 @@ Use lldb Python SBTarget API to iterate on the watchpoint(s) for the target. """ -from __future__ import print_function - - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/condition/TestWatchpointConditionAPI.py b/lldb/test/API/python_api/watchpoint/condition/TestWatchpointConditionAPI.py --- a/lldb/test/API/python_api/watchpoint/condition/TestWatchpointConditionAPI.py +++ b/lldb/test/API/python_api/watchpoint/condition/TestWatchpointConditionAPI.py @@ -2,9 +2,6 @@ Test watchpoint condition API. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/watchlocation/TestSetWatchlocation.py b/lldb/test/API/python_api/watchpoint/watchlocation/TestSetWatchlocation.py --- a/lldb/test/API/python_api/watchpoint/watchlocation/TestSetWatchlocation.py +++ b/lldb/test/API/python_api/watchpoint/watchlocation/TestSetWatchlocation.py @@ -2,10 +2,6 @@ Use lldb Python SBValue.WatchPointee() API to create a watchpoint for write of '*g_char_ptr'. 
""" -from __future__ import print_function - - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py b/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py --- a/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py +++ b/lldb/test/API/python_api/watchpoint/watchlocation/TestTargetWatchAddress.py @@ -2,9 +2,6 @@ Use lldb Python SBtarget.WatchAddress() API to create a watchpoint for write of '*g_char_ptr'. """ -from __future__ import print_function - - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/qemu/TestQemuAPI.py b/lldb/test/API/qemu/TestQemuAPI.py --- a/lldb/test/API/qemu/TestQemuAPI.py +++ b/lldb/test/API/qemu/TestQemuAPI.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import os from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/qemu/TestQemuLaunch.py b/lldb/test/API/qemu/TestQemuLaunch.py --- a/lldb/test/API/qemu/TestQemuLaunch.py +++ b/lldb/test/API/qemu/TestQemuLaunch.py @@ -1,4 +1,3 @@ -from __future__ import print_function import lldb import unittest import os diff --git a/lldb/test/API/sample_test/TestSampleInlineTest.py b/lldb/test/API/sample_test/TestSampleInlineTest.py --- a/lldb/test/API/sample_test/TestSampleInlineTest.py +++ b/lldb/test/API/sample_test/TestSampleInlineTest.py @@ -2,8 +2,6 @@ Describe the purpose of the test here. """ -from __future__ import absolute_import - from lldbsuite.test import lldbinline lldbinline.MakeInlineTest( diff --git a/lldb/test/API/source-manager/TestSourceManager.py b/lldb/test/API/source-manager/TestSourceManager.py --- a/lldb/test/API/source-manager/TestSourceManager.py +++ b/lldb/test/API/source-manager/TestSourceManager.py @@ -9,8 +9,6 @@ Test the caching mechanism of the source manager. """ -from __future__ import print_function - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py --- a/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py +++ b/lldb/test/API/terminal/TestSTTYBeforeAndAfter.py @@ -2,9 +2,6 @@ Test that 'stty -a' displays the same output before and after running the lldb command. 
""" -from __future__ import print_function - - import lldb import io import sys diff --git a/lldb/test/API/test_runner/test/inferior.py b/lldb/test/API/test_runner/test/inferior.py --- a/lldb/test/API/test_runner/test/inferior.py +++ b/lldb/test/API/test_runner/test/inferior.py @@ -1,8 +1,6 @@ #!/usr/bin/env python """Inferior program used by process control tests.""" -from __future__ import print_function - import argparse import datetime import signal diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteHostInfo.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteHostInfo.py --- a/lldb/test/API/tools/lldb-server/TestGdbRemoteHostInfo.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteHostInfo.py @@ -1,5 +1,3 @@ -from __future__ import print_function - # lldb test suite imports from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import TestBase diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py b/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py --- a/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemotePlatformFile.py @@ -1,5 +1,3 @@ -from __future__ import print_function - # lldb test suite imports from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import TestBase diff --git a/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py b/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py --- a/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py +++ b/lldb/test/API/tools/lldb-server/commandline/TestGdbRemoteConnection.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import gdbremote_testcase import random import select diff --git a/lldb/test/API/tools/lldb-vscode/console/TestVSCode_console.py b/lldb/test/API/tools/lldb-vscode/console/TestVSCode_console.py --- a/lldb/test/API/tools/lldb-vscode/console/TestVSCode_console.py +++ b/lldb/test/API/tools/lldb-vscode/console/TestVSCode_console.py @@ -2,8 +2,6 @@ Test lldb-vscode setBreakpoints request """ -from __future__ import print_function - import vscode from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/tools/lldb-vscode/correct-thread/TestVSCode_correct_thread.py b/lldb/test/API/tools/lldb-vscode/correct-thread/TestVSCode_correct_thread.py --- a/lldb/test/API/tools/lldb-vscode/correct-thread/TestVSCode_correct_thread.py +++ b/lldb/test/API/tools/lldb-vscode/correct-thread/TestVSCode_correct_thread.py @@ -2,8 +2,6 @@ Test lldb-vscode setBreakpoints request """ -from __future__ import print_function - import vscode from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py --- a/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py +++ b/lldb/test/API/tools/lldb-vscode/module/TestVSCode_module.py @@ -2,8 +2,6 @@ Test lldb-vscode setBreakpoints request """ -from __future__ import print_function - import vscode from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/tools/lldb-vscode/optimized/TestVSCode_optimized.py b/lldb/test/API/tools/lldb-vscode/optimized/TestVSCode_optimized.py --- a/lldb/test/API/tools/lldb-vscode/optimized/TestVSCode_optimized.py +++ b/lldb/test/API/tools/lldb-vscode/optimized/TestVSCode_optimized.py @@ -2,8 +2,6 @@ Test lldb-vscode variables/stackTrace request for 
optimized code """ -from __future__ import print_function - import vscode from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/tools/lldb-vscode/variables/TestVSCode_variables.py b/lldb/test/API/tools/lldb-vscode/variables/TestVSCode_variables.py --- a/lldb/test/API/tools/lldb-vscode/variables/TestVSCode_variables.py +++ b/lldb/test/API/tools/lldb-vscode/variables/TestVSCode_variables.py @@ -2,8 +2,6 @@ Test lldb-vscode setBreakpoints request """ -from __future__ import print_function - import vscode from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * diff --git a/lldb/test/API/types/AbstractBase.py b/lldb/test/API/types/AbstractBase.py --- a/lldb/test/API/types/AbstractBase.py +++ b/lldb/test/API/types/AbstractBase.py @@ -2,8 +2,6 @@ Abstract base class of basic types provides a generic type tester method. """ -from __future__ import print_function - import os import re import lldb diff --git a/lldb/test/Shell/Commands/CommandScriptImmediateOutput/Inputs/custom_command.py b/lldb/test/Shell/Commands/CommandScriptImmediateOutput/Inputs/custom_command.py --- a/lldb/test/Shell/Commands/CommandScriptImmediateOutput/Inputs/custom_command.py +++ b/lldb/test/Shell/Commands/CommandScriptImmediateOutput/Inputs/custom_command.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import sys diff --git a/lldb/test/Shell/helper/build.py b/lldb/test/Shell/helper/build.py --- a/lldb/test/Shell/helper/build.py +++ b/lldb/test/Shell/helper/build.py @@ -1,7 +1,5 @@ #!/usr/bin/env python -from __future__ import print_function - import argparse import os import shutil diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -339,6 +339,12 @@ set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name (32/64)" ) +# LLVM_INSTALL_PACKAGE_DIR needs to be declared prior to adding the tools +# subdirectory in order to have the value available for llvm-config. +include(GNUInstallPackageDir) +set(LLVM_INSTALL_PACKAGE_DIR "${CMAKE_INSTALL_PACKAGEDIR}/llvm" CACHE STRING + "Path for CMake subdirectory for LLVM (defaults to '${CMAKE_INSTALL_PACKAGEDIR}/llvm')") + set(LLVM_TOOLS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE STRING "Path for binary subdirectory (defaults to '${CMAKE_INSTALL_BINDIR}')") mark_as_advanced(LLVM_TOOLS_INSTALL_DIR) @@ -1141,6 +1147,9 @@ add_subdirectory(utils/UnicodeData) add_subdirectory(utils/yaml-bench) add_subdirectory(utils/split-file) + if( LLVM_INCLUDE_TESTS ) + add_subdirectory(utils/unittest) + endif() else() if ( LLVM_INCLUDE_TESTS ) message(FATAL_ERROR "Including tests when not building utils will not work. @@ -1185,9 +1194,6 @@ add_subdirectory(utils/lit) add_subdirectory(test) add_subdirectory(unittests) - if( LLVM_INCLUDE_UTILS ) - add_subdirectory(utils/unittest) - endif() if (WIN32) # This utility is used to prevent crashing tests from calling Dr. Watson on diff --git a/llvm/cmake/modules/CMakeLists.txt b/llvm/cmake/modules/CMakeLists.txt --- a/llvm/cmake/modules/CMakeLists.txt +++ b/llvm/cmake/modules/CMakeLists.txt @@ -1,10 +1,7 @@ -include(GNUInstallPackageDir) include(ExtendPath) include(LLVMDistributionSupport) include(FindPrefixFromConfig) -set(LLVM_INSTALL_PACKAGE_DIR "${CMAKE_INSTALL_PACKAGEDIR}/llvm" CACHE STRING - "Path for CMake subdirectory for LLVM (defaults to '${CMAKE_INSTALL_PACKAGEDIR}/llvm')") # CMAKE_INSTALL_PACKAGEDIR might be absolute, so don't reuse below. 
set(llvm_cmake_builddir "${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/llvm") diff --git a/llvm/cmake/modules/FindGRPC.cmake b/llvm/cmake/modules/FindGRPC.cmake --- a/llvm/cmake/modules/FindGRPC.cmake +++ b/llvm/cmake/modules/FindGRPC.cmake @@ -132,7 +132,7 @@ ARGS ${Flags} "${ProtoSourceAbsolutePath}" DEPENDS "${ProtoSourceAbsolutePath}") - add_clang_library(${LibraryName} ${GeneratedProtoSource} + add_llvm_library(${LibraryName} ${GeneratedProtoSource} PARTIAL_SOURCES_INTENDED LINK_LIBS PUBLIC grpc++ protobuf) diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -794,6 +794,9 @@ # Prevent bugs that can happen with llvm's brace style. add_flag_if_supported("-Wmisleading-indentation" MISLEADING_INDENTATION_FLAG) + + # Enable -Wctad-maybe-unsupported to catch unintended use of CTAD. + add_flag_if_supported("-Wctad-maybe-unsupported" CTAD_MAYBE_UNSUPPORTED_FLAG) endif (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)) if (LLVM_COMPILER_IS_GCC_COMPATIBLE AND NOT LLVM_ENABLE_WARNINGS) diff --git a/llvm/docs/CommandGuide/llvm-objdump.rst b/llvm/docs/CommandGuide/llvm-objdump.rst --- a/llvm/docs/CommandGuide/llvm-objdump.rst +++ b/llvm/docs/CommandGuide/llvm-objdump.rst @@ -312,6 +312,10 @@ Disassemble just the specified symbol's instructions. +.. option:: --chained-fixups + + Print chained fixup information. + .. option:: --dyld_info Print bind and rebase information used by dyld to resolve external diff --git a/llvm/docs/CommandGuide/llvm-otool.rst b/llvm/docs/CommandGuide/llvm-otool.rst --- a/llvm/docs/CommandGuide/llvm-otool.rst +++ b/llvm/docs/CommandGuide/llvm-otool.rst @@ -23,6 +23,10 @@ Select slice of universal Mach-O file. +.. option:: -chained_fixups + + Print chained fixup information. + .. option:: -C Print linker optimization hints. diff --git a/llvm/include/llvm/ADT/Optional.h b/llvm/include/llvm/ADT/Optional.h --- a/llvm/include/llvm/ADT/Optional.h +++ b/llvm/include/llvm/ADT/Optional.h @@ -348,6 +348,7 @@ return None; } template <class Function> + LLVM_DEPRECATED("Use transform instead.", "transform") auto map(const Function &F) const & -> Optional<decltype(F(value()))> { if (*this) return F(value()); @@ -378,6 +379,7 @@ return None; } template <class Function> + LLVM_DEPRECATED("Use transform instead.", "transform") auto map(const Function &F) && -> Optional<decltype(F(std::move(*this).value()))> { if (*this) diff --git a/llvm/include/llvm/ADT/SmallSet.h b/llvm/include/llvm/ADT/SmallSet.h --- a/llvm/include/llvm/ADT/SmallSet.h +++ b/llvm/include/llvm/ADT/SmallSet.h @@ -141,6 +141,7 @@ std::set<T, C> Set; using VIterator = typename SmallVector<T, N>::const_iterator; + using SIterator = typename std::set<T, C>::const_iterator; using mutable_iterator = typename SmallVector<T, N>::iterator; // In small mode SmallPtrSet uses linear search for the elements, so it is @@ -171,22 +172,21 @@ } /// insert - Insert an element into the set if it isn't already there. - /// Returns true if the element is inserted (it was not in the set before). - /// The first value of the returned pair is unused and provided for - /// partial compatibility with the standard library self-associative container - /// concept. - // FIXME: Add iterators that abstract over the small and large form, and then - // return those here. - std::pair<NoneType, bool> insert(const T &V) { - if (!isSmall()) - return std::make_pair(None, Set.insert(V).second); + /// Returns a pair. The first value of it is an iterator to the inserted + /// element or the existing element in the set.
The second value is true + /// if the element is inserted (it was not in the set before). + std::pair<const_iterator, bool> insert(const T &V) { + if (!isSmall()) { + auto [I, Inserted] = Set.insert(V); + return std::make_pair(const_iterator(I), Inserted); + } VIterator I = vfind(V); if (I != Vector.end()) // Don't reinsert if it already exists. - return std::make_pair(None, false); + return std::make_pair(const_iterator(I), false); if (Vector.size() < N) { Vector.push_back(V); - return std::make_pair(None, true); + return std::make_pair(const_iterator(std::prev(Vector.end())), true); } // Otherwise, grow from vector to set. @@ -194,8 +194,7 @@ Set.insert(Vector.back()); Vector.pop_back(); } - Set.insert(V); - return std::make_pair(None, true); + return std::make_pair(const_iterator(Set.insert(V).first), true); } template <typename IterT> diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h --- a/llvm/include/llvm/ADT/SmallVector.h +++ b/llvm/include/llvm/ADT/SmallVector.h @@ -92,8 +92,8 @@ template <class T> using SmallVectorSizeType = - typename std::conditional<sizeof(T) >= 8, uint64_t, - uint32_t>::type; + std::conditional_t<sizeof(T) >= 8, uint64_t, + uint32_t>; /// Figure out the offset of the first element. template <class T, typename = void> struct SmallVectorAlignmentAndSize { diff --git a/llvm/include/llvm/Analysis/RegionInfoImpl.h b/llvm/include/llvm/Analysis/RegionInfoImpl.h --- a/llvm/include/llvm/Analysis/RegionInfoImpl.h +++ b/llvm/include/llvm/Analysis/RegionInfoImpl.h @@ -390,10 +390,10 @@ template <class Tr> void RegionBase<Tr>::addSubRegion(RegionT *SubRegion, bool moveChildren) { assert(!SubRegion->parent && "SubRegion already has a parent!"); - assert(llvm::find_if(*this, + assert(llvm::none_of(*this, [&](const std::unique_ptr<RegionT> &R) { return R.get() == SubRegion; - }) == children.end() && + }) && "Subregion already exists!"); SubRegion->parent = static_cast<RegionT *>(this); diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -529,11 +529,9 @@ bool containsAddRecurrence(const SCEV *S); /// Is operation \p BinOp between \p LHS and \p RHS provably does not have - /// a signed/unsigned overflow (\p Signed)? If \p CtxI is specified, the - /// no-overflow fact should be true in the context of this instruction. + /// a signed/unsigned overflow (\p Signed)? bool willNotOverflow(Instruction::BinaryOps BinOp, bool Signed, - const SCEV *LHS, const SCEV *RHS, - const Instruction *CtxI = nullptr); + const SCEV *LHS, const SCEV *RHS); /// Parse NSW/NUW flags from add/sub/mul IR binary operation \p Op into /// SCEV no-wrap flags, and deduce flag[s] that aren't known yet. diff --git a/llvm/include/llvm/BinaryFormat/MachO.h b/llvm/include/llvm/BinaryFormat/MachO.h --- a/llvm/include/llvm/BinaryFormat/MachO.h +++ b/llvm/include/llvm/BinaryFormat/MachO.h @@ -1002,6 +1002,19 @@ uint64_t n_value; }; +// Values for dyld_chained_fixups_header::imports_format. +enum { + DYLD_CHAINED_IMPORT = 1, + DYLD_CHAINED_IMPORT_ADDEND = 2, + DYLD_CHAINED_IMPORT_ADDEND64 = 3, +}; + +// Values for dyld_chained_fixups_header::symbols_format. +enum { + DYLD_CHAINED_SYMBOL_UNCOMPRESSED = 0, + DYLD_CHAINED_SYMBOL_ZLIB = 1, +}; + /// Structs for dyld chained fixups. /// dyld_chained_fixups_header is the data pointed to by LC_DYLD_CHAINED_FIXUPS /// load command.
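
For context on the SmallSet.h hunks above: a minimal usage sketch of the new insert contract, which now returns a std::pair<const_iterator, bool> in the style of std::set::insert. The snippet is illustrative only, assumes the patched llvm/ADT/SmallSet.h, and is not part of the patch itself.

    // Sketch: exercising SmallSet::insert's new return value (assumes the
    // patched llvm/ADT/SmallSet.h; illustrative, not part of this patch).
    #include "llvm/ADT/SmallSet.h"
    #include <cassert>

    int main() {
      llvm::SmallSet<int, 4> S;             // small mode: elements live in a SmallVector
      auto [It, Inserted] = S.insert(42);   // pair<const_iterator, bool>
      assert(Inserted && *It == 42);        // newly inserted element
      auto [It2, Inserted2] = S.insert(42); // duplicate insert
      assert(!Inserted2 && *It2 == 42);     // iterator to the existing element
      return 0;
    }

Under the old contract the first member of the pair was an unused NoneType placeholder; callers can now use the iterator directly, whether the set is in small (vector) or large (std::set) mode.
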
diff --git a/llvm/include/llvm/CodeGen/RDFGraph.h b/llvm/include/llvm/CodeGen/RDFGraph.h --- a/llvm/include/llvm/CodeGen/RDFGraph.h +++ b/llvm/include/llvm/CodeGen/RDFGraph.h @@ -934,6 +934,8 @@ const DataFlowGraph &G; }; + template <typename T> Print(const T &, const DataFlowGraph &) -> Print<T>; + template <typename T> struct PrintNode : Print<NodeAddr<T>> { PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_i386.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_i386.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_i386.h @@ -0,0 +1,39 @@ +//===--- ELF_i386.h - JIT link functions for ELF/i386 --*- C++ -*----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// +// jit-link functions for ELF/i386. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_ELF_I386_H +#define LLVM_EXECUTIONENGINE_JITLINK_ELF_I386_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +namespace llvm { +namespace jitlink { + +/// Create a LinkGraph from an ELF/i386 relocatable object +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected<std::unique_ptr<LinkGraph>> +createLinkGraphFromELFObject_i386(MemoryBufferRef ObjectBuffer); + +/// jit-link the given object buffer, which must be an ELF i386 relocatable +/// object file. +void link_ELF_i386(std::unique_ptr<LinkGraph> G, + std::unique_ptr<JITLinkContext> Ctx); + +} // end namespace jitlink +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_ELF_I386_H diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -1004,10 +1004,10 @@ /// Create a section with the given name, protection flags, and alignment. Section &createSection(StringRef Name, MemProt Prot) { - assert(llvm::find_if(Sections, + assert(llvm::none_of(Sections, [&](std::unique_ptr<Section> &Sec) { return Sec->getName() == Name; - }) == Sections.end() && + }) && "Duplicate section name"); std::unique_ptr<Section> Sec(new Section(Name, Prot, Sections.size())); Sections.push_back(std::move(Sec)); @@ -1349,9 +1349,8 @@ assert(ExternalSymbols.count(&Sym) && "Symbol is not in the externals set"); ExternalSymbols.erase(&Sym); Addressable &Base = *Sym.Base; - assert(llvm::find_if(ExternalSymbols, - [&](Symbol *AS) { return AS->Base == &Base; }) == - ExternalSymbols.end() && + assert(llvm::none_of(ExternalSymbols, + [&](Symbol *AS) { return AS->Base == &Base; }) && "Base addressable still in use"); destroySymbol(Sym); destroyAddressable(Base); @@ -1365,9 +1364,8 @@ "Symbol is not in the absolute symbols set"); AbsoluteSymbols.erase(&Sym); Addressable &Base = *Sym.Base; - assert(llvm::find_if(ExternalSymbols, - [&](Symbol *AS) { return AS->Base == &Base; }) == - ExternalSymbols.end() && + assert(llvm::none_of(ExternalSymbols, + [&](Symbol *AS) { return AS->Base == &Base; }) && "Base addressable still in use"); destroySymbol(Sym); destroyAddressable(Base); diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/i386.h b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h @@ -0,0 +1,38 @@ +//=== i386.h - Generic JITLink i386 edge kinds, utilities -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing i386 objects. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_I386_H +#define LLVM_EXECUTIONENGINE_JITLINK_I386_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +namespace llvm { +namespace jitlink { +namespace i386 { + +/// Represents i386 fixups +enum EdgeKind_i386 : Edge::Kind { + + /// None + None = Edge::FirstRelocation, + +}; + +/// Returns a string name for the given i386 edge. For debugging purposes +/// only. +const char *getEdgeKindName(Edge::Kind K); + +} // namespace i386 +} // namespace jitlink +} // namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_I386_H \ No newline at end of file diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h --- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h @@ -18,6 +18,7 @@ #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/Mangling.h" +#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" #include "llvm/ExecutionEngine/Orc/Shared/OrcError.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/Object/Archive.h" @@ -314,6 +315,40 @@ DenseMap<SymbolStringPtr, MemoryBufferRef> ObjectFilesMap; }; +/// A utility class to create COFF dllimport GOT symbols (__imp_*) and PLT +/// stubs. +/// +/// If an instance of this class is attached to a JITDylib as a fallback +/// definition generator, PLT stubs and dllimport __imp_ symbols will be +/// generated for external symbols found outside the given jitdylib. Currently +/// only supports the x86_64 architecture. +class DLLImportDefinitionGenerator : public DefinitionGenerator { +public: + /// Creates a DLLImportDefinitionGenerator instance.
+ static std::unique_ptr<DLLImportDefinitionGenerator> + Create(ExecutionSession &ES, ObjectLinkingLayer &L); + + Error tryToGenerate(LookupState &LS, LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &Symbols) override; + +private: + DLLImportDefinitionGenerator(ExecutionSession &ES, ObjectLinkingLayer &L) + : ES(ES), L(L) {} + + static Expected<unsigned> getTargetPointerSize(const Triple &TT); + static Expected<support::endianness> getTargetEndianness(const Triple &TT); + Expected<std::unique_ptr<jitlink::LinkGraph>> + createStubsGraph(const SymbolMap &Resolved); + + static StringRef getImpPrefix() { return "__imp_"; } + + static StringRef getSectionName() { return "$__DLLIMPORT_STUBS"; } + + ExecutionSession &ES; + ObjectLinkingLayer &L; +}; + } // end namespace orc } // end namespace llvm diff --git a/llvm/include/llvm/IR/CFG.h b/llvm/include/llvm/IR/CFG.h --- a/llvm/include/llvm/IR/CFG.h +++ b/llvm/include/llvm/IR/CFG.h @@ -47,7 +47,7 @@ using pointer = Ptr *; using reference = Ptr *; -private: +protected: using Self = PredIterator<Ptr, USE_iterator>; USE_iterator It; diff --git a/llvm/include/llvm/MC/MCDwarf.h b/llvm/include/llvm/MC/MCDwarf.h --- a/llvm/include/llvm/MC/MCDwarf.h +++ b/llvm/include/llvm/MC/MCDwarf.h @@ -387,6 +387,7 @@ bool hasRootFile() const { return !Header.RootFile.Name.empty(); } + MCDwarfFile &getRootFile() { return Header.RootFile; } const MCDwarfFile &getRootFile() const { return Header.RootFile; } // Report whether MD5 usage has been consistent (all-or-none). diff --git a/llvm/include/llvm/Support/GenericDomTreeConstruction.h b/llvm/include/llvm/Support/GenericDomTreeConstruction.h --- a/llvm/include/llvm/Support/GenericDomTreeConstruction.h +++ b/llvm/include/llvm/Support/GenericDomTreeConstruction.h @@ -712,7 +712,7 @@ assert(IsPostDom && "This function is only for postdominators"); // The tree has only trivial roots -- nothing to update. - if (std::none_of(DT.Roots.begin(), DT.Roots.end(), [BUI](const NodePtr N) { + if (llvm::none_of(DT.Roots, [BUI](const NodePtr N) { return HasForwardSuccessors(N, BUI); })) return; diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h --- a/llvm/include/llvm/Transforms/IPO.h +++ b/llvm/include/llvm/Transforms/IPO.h @@ -240,10 +240,6 @@ /// devirtualization and control-flow integrity. ModulePass *createGlobalSplitPass(); -/// Write ThinLTO-ready bitcode to Str. -ModulePass *createWriteThinLTOBitcodePass(raw_ostream &Str, - raw_ostream *ThinLinkOS = nullptr); - } // End llvm namespace #endif diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h --- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -222,6 +222,8 @@ Value *optimizePuts(CallInst *CI, IRBuilderBase &B); // Helper methods + Value *emitSnPrintfMemCpy(CallInst *CI, Value *StrArg, StringRef Str, + uint64_t N, IRBuilderBase &B); Value *emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilderBase &B); void classifyArgUse(Value *Val, Function *F, bool IsFloat, diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2868,6 +2868,11 @@ if (isImpliedCondition(LHS, RHS, Q.DL).value_or(false)) return getTrue(ITy); break; + case ICmpInst::ICMP_SLE: /// SLE follows the same logic as SGE with the LHS and RHS swapped.
+ if (isImpliedCondition(RHS, LHS, Q.DL).value_or(false)) + return getTrue(ITy); + break; } return nullptr; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -242,11 +242,6 @@ cl::desc("Handle <= and >= in finite loops"), cl::init(true)); -static cl::opt<bool> UseContextForNoWrapFlagInference( - "scalar-evolution-use-context-for-no-wrap-flag-strenghening", cl::Hidden, - cl::desc("Infer nuw/nsw flags using context where suitable"), - cl::init(true)); - //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// @@ -2290,8 +2285,7 @@ } bool ScalarEvolution::willNotOverflow(Instruction::BinaryOps BinOp, bool Signed, - const SCEV *LHS, const SCEV *RHS, - const Instruction *CtxI) { + const SCEV *LHS, const SCEV *RHS) { const SCEV *(ScalarEvolution::*Operation)(const SCEV *, const SCEV *, SCEV::NoWrapFlags, unsigned); switch (BinOp) { @@ -2322,30 +2316,7 @@ const SCEV *LHSB = (this->*Extension)(LHS, WideTy, 0); const SCEV *RHSB = (this->*Extension)(RHS, WideTy, 0); const SCEV *B = (this->*Operation)(LHSB, RHSB, SCEV::FlagAnyWrap, 0); - if (A == B) - return true; - // Can we use context to prove the fact we need? - if (!CtxI) - return false; - // We can prove that add(x, constant) doesn't wrap if isKnownPredicateAt can - // guarantee that x <= max_int - constant at the given context. - // TODO: Support other operations. - if (BinOp != Instruction::Add) - return false; - auto *RHSC = dyn_cast<SCEVConstant>(RHS); - // TODO: Lift this limitation. - if (!RHSC) - return false; - APInt C = RHSC->getAPInt(); - // TODO: Also lift this limitation. - if (Signed && C.isNegative()) - return false; - unsigned NumBits = C.getBitWidth(); - APInt Max = - Signed ? APInt::getSignedMaxValue(NumBits) : APInt::getMaxValue(NumBits); - APInt Limit = Max - C; - ICmpInst::Predicate Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; - return isKnownPredicateAt(Pred, LHS, getConstant(Limit), CtxI); + return A == B; } Optional<SCEV::NoWrapFlags> @@ -2372,18 +2343,16 @@ const SCEV *LHS = getSCEV(OBO->getOperand(0)); const SCEV *RHS = getSCEV(OBO->getOperand(1)); - const Instruction *CtxI = - UseContextForNoWrapFlagInference ? dyn_cast<Instruction>(OBO) : nullptr; if (!OBO->hasNoUnsignedWrap() && willNotOverflow((Instruction::BinaryOps)OBO->getOpcode(), - /* Signed */ false, LHS, RHS, CtxI)) { + /* Signed */ false, LHS, RHS)) { Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNUW); Deduced = true; } if (!OBO->hasNoSignedWrap() && willNotOverflow((Instruction::BinaryOps)OBO->getOpcode(), - /* Signed */ true, LHS, RHS, CtxI)) { + /* Signed */ true, LHS, RHS)) { Flags = ScalarEvolution::setFlags(Flags, SCEV::FlagNSW); Deduced = true; } diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp --- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -663,9 +663,10 @@ Asm->OutStreamer->emitLabel(CSRange.ExceptionLabel); // Emit the LSDA header. - // If only one call-site range exists, LPStart is omitted as it is the - // same as the function entry. - if (CallSiteRanges.size() == 1) { + // LPStart is omitted if either we have a single call-site range (in which + // case the function entry is treated as @LPStart) or if this function has + // no landing pads (in which case @LPStart is undefined).
+ if (CallSiteRanges.size() == 1 || LandingPadRange == nullptr) { Asm->emitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart"); } else if (!Asm->isPositionIndependent()) { // For more than one call-site ranges, LPStart must be explicitly diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7754,14 +7754,14 @@ return false; // Check that GEP is used outside the block, meaning it's alive on the // IndirectBr edge(s). - if (find_if(GEPI->users(), [&](User *Usr) { + if (llvm::none_of(GEPI->users(), [&](User *Usr) { if (auto *I = dyn_cast<Instruction>(Usr)) { if (I->getParent() != SrcBlock) { return true; } } return false; - }) == GEPI->users().end()) + })) return false; // The second elements of the GEP chains to be unmerged. std::vector<GetElementPtrInst *> UGEPIs; diff --git a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp --- a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp @@ -193,12 +193,10 @@ "lowest stage of an interval in this LR") \ M(float, progress, {1}, "ratio of current queue size to initial size") -// The model learns to pick one of the mask == 1 interferences. This is the name -// of the output tensor. -// The contract with the model is that the output will be guaranteed to be to a -// mask == 1 position. -// Using a macro here to avoid 'not used' warnings (and keep cond compilation to -// a minimum) +// The model learns to pick one of the mask == 1 interferences. This is the +// name of the output tensor. The contract with the model is that the output +// will be guaranteed to be to a mask == 1 position. Using a macro here to +// avoid 'not used' warnings (and keep cond compilation to a minimum) #define DecisionName "index_to_evict" // Named features index. @@ -211,7 +209,8 @@ // The ML advisor will typically have a sparse input to the evaluator, because // various phys regs won't be available. It's easier (maintenance-wise) to -// bulk-reset the state of the evaluator each time we are about to use it again. +// bulk-reset the state of the evaluator each time we are about to use it +// again. template <typename T> size_t getTotalSize(const std::vector<int64_t> &Shape) { size_t Ret = sizeof(T); for (const auto V : Shape) @@ -227,8 +226,8 @@ #undef _RESET } -// Per-live interval components that get aggregated into the feature values that -// will be passed to the evaluator. +// Per-live interval components that get aggregated into the feature values +// that will be passed to the evaluator. struct LIFeatureComponents { double R = 0; double W = 0; @@ -242,7 +241,8 @@ using CandidateRegList = std::array<std::pair<MCRegister, bool>, NumberOfInterferences>; -using FeaturesListNormalizer = std::array<float, FeatureIDs::FeatureCount>; +using FeaturesListNormalizer = + llvm::SmallVector<float>; /// The ML evictor (commonalities between release and development mode) class MLEvictAdvisor : public RegAllocEvictionAdvisor { @@ -260,10 +260,10 @@ // error, and we shouldn't be asking for it here. const MLModelRunner &getRunner() const { return *Runner; } - /// This just calls Evaluate on the Runner, but in the development mode case, - /// if we're just capturing the log of the default advisor, it needs to call - /// the latter instead, so we need to pass all the necessary parameters for - /// it. In the development case, it will also log.
+ /// This just calls Evaluate on the Runner, but in the development mode + /// case, if we're just capturing the log of the default advisor, it needs + /// to call the latter instead, so we need to pass all the necessary + /// parameters for it. In the development case, it will also log. virtual int64_t tryFindEvictionCandidatePosition(const LiveInterval &VirtReg, const AllocationOrder &Order, @@ -272,11 +272,11 @@ /// Load the features of the given VirtReg (allocated or not) at column Pos, /// but if that can't be evicted, return false instead. - bool - loadInterferenceFeatures(const LiveInterval &VirtReg, MCRegister PhysReg, - bool IsHint, const SmallVirtRegSet &FixedRegisters, - std::array<float, FeatureIDs::FeatureCount> &Largest, - size_t Pos) const; + bool loadInterferenceFeatures(const LiveInterval &VirtReg, MCRegister PhysReg, + bool IsHint, + const SmallVirtRegSet &FixedRegisters, + llvm::SmallVectorImpl<float> &Largest, + size_t Pos) const; private: static float getInitialQueueSize(const MachineFunction &MF); @@ -287,11 +287,12 @@ const SmallVirtRegSet &FixedRegisters) const override; void extractFeatures(const SmallVectorImpl<const LiveInterval *> &Intervals, - std::array<float, FeatureIDs::FeatureCount> &Largest, - size_t Pos, int64_t IsHint, int64_t LocalIntfsCount, + llvm::SmallVectorImpl<float> &Largest, size_t Pos, + int64_t IsHint, int64_t LocalIntfsCount, float NrUrgent) const; - // Point-in-time: we didn't learn this, so we always delegate to the default. + // Point-in-time: we didn't learn this, so we always delegate to the + // default. bool canEvictHintInterference( const LiveInterval &VirtReg, MCRegister PhysReg, const SmallVirtRegSet &FixedRegisters) const override { @@ -303,9 +304,9 @@ getLIFeatureComponents(const LiveInterval &LI) const; // Hold on to a default advisor for: - // 1) the implementation of canEvictHintInterference, because we didn't learn - // that nuance yet; - // 2) for bootstrapping (logging) in the development mode case. + // 1) the implementation of canEvictHintInterference, because we didn't + // learn that nuance yet; 2) for bootstrapping (logging) in the development + // mode case. const DefaultEvictionAdvisor DefaultAdvisor; MLModelRunner *const Runner; const MachineBlockFrequencyInfo &MBFI; @@ -323,10 +324,6 @@ #define _DECL_FEATURES(type, name, shape, _) \ TensorSpec::createSpec<type>(#name, shape), -static const std::vector<TensorSpec> InputFeatures{ - {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}, -}; -#undef _DECL_FEATURES // =================================== // Release (AOT) - specifics // =================================== @@ -334,13 +331,17 @@ : public RegAllocEvictionAdvisorAnalysis { public: ReleaseModeEvictionAdvisorAnalysis() - : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Release) {} + : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Release) { + InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; + } // support for isa<> and dyn_cast. static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { return R->getAdvisorMode() == AdvisorMode::Release; } private: + std::vector<TensorSpec> InputFeatures; + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBlockFrequencyInfo>(); AU.addRequired<MachineLoopInfo>(); @@ -370,19 +371,12 @@ static const TensorSpec Reward = TensorSpec::createSpec<float>("reward", {1}); // Features we bind on the model. The tensor names have a prefix, and we also -// need to include some tensors that are expected to be present by the training -// algo. +// need to include some tensors that are expected to be present by the +// training algo. // TODO: can we just get rid of these?
#define _DECL_TRAIN_FEATURES(type, name, shape, _) \ TensorSpec::createSpec<type>(std::string("action_") + #name, shape), -static const std::vector<TensorSpec> TrainingInputFeatures{ - {RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES) - TensorSpec::createSpec<float>("action_discount", {1}), - TensorSpec::createSpec<int32_t>("action_step_type", {1}), - TensorSpec::createSpec<float>("action_reward", {1})}}; -#undef _DECL_TRAIN_FEATURES - class DevelopmentModeEvictAdvisor : public MLEvictAdvisor { public: DevelopmentModeEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA, @@ -404,7 +398,14 @@ : public RegAllocEvictionAdvisorAnalysis { public: DevelopmentModeEvictionAdvisorAnalysis() - : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Development) {} + : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Development) { + InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)}; + TrainingInputFeatures = { + RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES) + TensorSpec::createSpec<float>("action_discount", {1}), + TensorSpec::createSpec<int32_t>("action_step_type", {1}), + TensorSpec::createSpec<float>("action_reward", {1})}; + } // support for isa<> and dyn_cast. static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { return R->getAdvisorMode() == AdvisorMode::Development; @@ -420,6 +421,9 @@ } private: + std::vector<TensorSpec> InputFeatures; + std::vector<TensorSpec> TrainingInputFeatures; + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineBlockFrequencyInfo>(); AU.addRequired<MachineLoopInfo>(); @@ -486,6 +490,7 @@ std::unique_ptr<MLModelRunner> Runner; StringMap<std::unique_ptr<Logger>> LogMap; }; + #endif //#ifdef LLVM_HAVE_TF_API } // namespace @@ -529,8 +534,8 @@ bool MLEvictAdvisor::loadInterferenceFeatures( const LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, - const SmallVirtRegSet &FixedRegisters, FeaturesListNormalizer &Largest, - size_t Pos) const { + const SmallVirtRegSet &FixedRegisters, + llvm::SmallVectorImpl<float> &Largest, size_t Pos) const { // It is only possible to evict virtual register interference. if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg) { // leave unavailable @@ -547,8 +552,8 @@ SmallVector<const LiveInterval *> InterferingIntervals; for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); - // Different from the default heuristic, we don't make any assumptions about - // what having more than 10 results in the query may mean. + // Different from the default heuristic, we don't make any assumptions + // about what having more than 10 results in the query may mean. const auto &IFIntervals = Q.interferingVRegs(EvictInterferenceCutoff); if (IFIntervals.empty() && InterferingIntervals.empty()) continue; @@ -605,14 +610,14 @@ // max, then any of the costs of the legally-evictable intervals // would be lower. When that happens, one of those will be selected. // Therefore, we allow the candidate be selected, unless the candidate is - // unspillable, in which case it would be incorrect to not find a register for - // it. + // unspillable, in which case it would be incorrect to not find a register + // for it. const bool MustFindEviction = (!VirtReg.isSpillable() && CostPerUseLimit == static_cast<uint8_t>(~0u)); // Number of available candidates - if 0, no need to continue. size_t Available = 0; - // Make sure we don't have leftover partial state from an attempt where we had - // no available candidates and bailed out early. + // Make sure we don't have leftover partial state from an attempt where we + // had no available candidates and bailed out early.
resetInputs(*Runner); // Track the index->register mapping because AllocationOrder doesn't do that @@ -625,15 +630,13 @@ // only normalize (some of) the float features, but it's just simpler to // dimension 'Largest' to all the features, especially since we have the // 'DoNotNormalize' list. - FeaturesListNormalizer Largest; - Largest.fill(0.0); - - // Same overal idea as in the default eviction policy - we visit the values of - // AllocationOrder one at a time. If it's not legally available, we mask off - // the corresponding feature column (==do nothing because we already reset all - // the features to 0) - // Use Pos to capture the column we load features at - in AllocationOrder - // order. + FeaturesListNormalizer Largest(FeatureIDs::FeatureCount, 0.0); + + // Same overall idea as in the default eviction policy - we visit the values + // of AllocationOrder one at a time. If it's not legally available, we mask + // off the corresponding feature column (==do nothing because we already + // reset all the features to 0). Use Pos to capture the column we load + // features at - in AllocationOrder order. size_t Pos = 0; for (auto I = Order.begin(), E = Order.getOrderLimitEnd(OrderLimit); I != E; ++I, ++Pos) { @@ -660,7 +663,8 @@ Regs[CandidateVirtRegPos].second = !MustFindEviction; if (!MustFindEviction) extractFeatures(SmallVector<const LiveInterval *, 1>(1, &VirtReg), Largest, - CandidateVirtRegPos, /*IsHint*/ 0, /*LocalIntfsCount*/ 0, + CandidateVirtRegPos, /*IsHint*/ 0, + /*LocalIntfsCount*/ 0, /*NrUrgent*/ 0.0); assert(InitialQSize > 0.0 && "We couldn't have gotten here if we had " "nothing to allocate initially."); @@ -747,8 +751,8 @@ // of accummulating the various features, we keep them separate. void MLEvictAdvisor::extractFeatures( const SmallVectorImpl<const LiveInterval *> &Intervals, - std::array<float, FeatureIDs::FeatureCount> &Largest, size_t Pos, - int64_t IsHint, int64_t LocalIntfsCount, float NrUrgent) const { + llvm::SmallVectorImpl<float> &Largest, size_t Pos, int64_t IsHint, + int64_t LocalIntfsCount, float NrUrgent) const { int64_t NrDefsAndUses = 0; int64_t NrBrokenHints = 0; double R = 0.0; @@ -854,9 +858,9 @@ } else { MCRegister PhysReg = getDefaultAdvisor().tryFindEvictionCandidate( VirtReg, Order, CostPerUseLimit, FixedRegisters); - // Find the index of the selected PhysReg. We need it for logging, otherwise - // this is wasted cycles (but so would starting development mode without a - // model nor logging) + // Find the index of the selected PhysReg.
We need it for logging, + // otherwise this is wasted cycles (but so would starting development mode + // without a model nor logging) if (!PhysReg) Ret = CandidateVirtRegPos; else diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -281,7 +281,6 @@ Register traceCopies(Register VirtReg) const; Register traceCopyChain(Register Reg) const; - bool shouldAllocateRegister(const Register Reg) const; int getStackSpaceFor(Register VirtReg); void spill(MachineBasicBlock::iterator Before, Register VirtReg, MCPhysReg AssignedReg, bool Kill, bool LiveOut); @@ -301,12 +300,6 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false, false) -bool RegAllocFast::shouldAllocateRegister(const Register Reg) const { - assert(Register::isVirtualRegister(Reg)); - const TargetRegisterClass &RC = *MRI->getRegClass(Reg); - return ShouldAllocateClass(*TRI, RC); -} - void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) { for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) RegUnitStates[*UI] = NewState; @@ -846,8 +839,6 @@ assert(MO.isUndef() && "expected undef use"); Register VirtReg = MO.getReg(); assert(Register::isVirtualRegister(VirtReg) && "Expected virtreg"); - if (!shouldAllocateRegister(VirtReg)) - return; LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg); MCPhysReg PhysReg; @@ -873,8 +864,6 @@ /// (tied or earlyclobber) that may interfere with preassigned uses. void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg) { - if (!shouldAllocateRegister(VirtReg)) - return; LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg); if (LRI != LiveVirtRegs.end()) { MCPhysReg PrevReg = LRI->PhysReg; @@ -908,8 +897,6 @@ void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg, bool LookAtPhysRegUses) { assert(VirtReg.isVirtual() && "Not a virtual register"); - if (!shouldAllocateRegister(VirtReg)) - return; MachineOperand &MO = MI.getOperand(OpNum); LiveRegMap::iterator LRI; bool New; @@ -960,8 +947,6 @@ void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg) { assert(VirtReg.isVirtual() && "Not a virtual register"); - if (!shouldAllocateRegister(VirtReg)) - return; MachineOperand &MO = MI.getOperand(OpNum); LiveRegMap::iterator LRI; bool New; @@ -986,13 +971,8 @@ Register Hint; if (MI.isCopy() && MI.getOperand(1).getSubReg() == 0) { Hint = MI.getOperand(0).getReg(); - if (Hint.isVirtual()) { - assert(!shouldAllocateRegister(Hint)); - Hint = Register(); - } else { - assert(Hint.isPhysical() && - "Copy destination should already be assigned"); - } + assert(Hint.isPhysical() && + "Copy destination should already be assigned"); } allocVirtReg(MI, *LRI, Hint, false); if (LRI->Error) { @@ -1100,8 +1080,6 @@ assert(RegClassDefCounts.size() == TRI->getNumRegClasses()); if (Reg.isVirtual()) { - if (!shouldAllocateRegister(Reg)) - return; const TargetRegisterClass *OpRC = MRI->getRegClass(Reg); for (unsigned RCIdx = 0, RCIdxEnd = TRI->getNumRegClasses(); RCIdx != RCIdxEnd; ++RCIdx) { @@ -1161,8 +1139,6 @@ if (MO.isReg()) { Register Reg = MO.getReg(); if (Reg.isVirtual()) { - if (!shouldAllocateRegister(Reg)) - continue; if (MO.isDef()) { HasDef = true; HasVRegDef = true; @@ -1226,7 +1202,7 @@ } if (MO.isDef()) { - if (Reg.isVirtual() && shouldAllocateRegister(Reg)) + if (Reg.isVirtual()) DefOperandIndexes.push_back(I); addRegClassDefCounts(RegClassDefCounts, Reg); @@ 
-1316,10 +1292,6 @@ Register Reg = MO.getReg(); if (!Reg) continue; - if (Reg.isVirtual()) { - assert(!shouldAllocateRegister(Reg)); - continue; - } assert(Reg.isPhysical()); if (MRI->isReserved(Reg)) continue; @@ -1366,7 +1338,7 @@ if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) + if (!Reg.isVirtual()) continue; if (MO.isUndef()) { @@ -1393,7 +1365,7 @@ if (!MO.isReg() || !MO.isUse()) continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) + if (!Reg.isVirtual()) continue; assert(MO.isUndef() && "Should only have undef virtreg uses left"); @@ -1416,10 +1388,6 @@ Register Reg = MO.getReg(); if (!Reg) continue; - if (Reg.isVirtual()) { - assert(!shouldAllocateRegister(Reg)); - continue; - } assert(Reg.isPhysical() && "should have register assigned"); // We sometimes get odd situations like: @@ -1449,8 +1417,6 @@ for (Register Reg : MI.getUsedDebugRegs()) { if (!Register::isVirtualRegister(Reg)) continue; - if (!shouldAllocateRegister(Reg)) - continue; // Already spilled to a stackslot? int SS = StackSlotForVirtReg[Reg]; @@ -1491,7 +1457,7 @@ continue; Register Reg = MO.getReg(); - if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) + if (!Reg.isVirtual()) continue; DenseMap::iterator DI; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13213,6 +13213,26 @@ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1); } + // Fold (iM_signext_inreg + // (extract_subvector (zext|anyext|sext iN_v to _) _) + // from iN) + // -> (extract_subvector (signext iN_v to iM)) + if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.hasOneUse() && + ISD::isExtOpcode(N0.getOperand(0).getOpcode())) { + SDValue InnerExt = N0.getOperand(0); + EVT InnerExtVT = InnerExt->getValueType(0); + SDValue Extendee = InnerExt->getOperand(0); + + if (ExtVTBits == Extendee.getValueType().getScalarSizeInBits() && + (!LegalOperations || + TLI.isOperationLegal(ISD::SIGN_EXTEND, InnerExtVT))) { + SDValue SignExtExtendee = + DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), InnerExtVT, Extendee); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, SignExtExtendee, + N0.getOperand(1)); + } + } + return SDValue(); } @@ -22846,25 +22866,31 @@ SDLoc DL(N); EVT IntVT = VT.changeVectorElementTypeToInteger(); EVT IntSVT = VT.getVectorElementType().changeTypeToInteger(); - IntSVT = TLI.getTypeToTransformTo(*DAG.getContext(), IntSVT); - SDValue ZeroElt = DAG.getConstant(0, DL, IntSVT); - SDValue AllOnesElt = DAG.getAllOnesConstant(DL, IntSVT); - SmallVector AndMask(NumElts, DAG.getUNDEF(IntSVT)); - for (int I = 0; I != (int)NumElts; ++I) - if (0 <= Mask[I]) - AndMask[I] = Mask[I] == I ? AllOnesElt : ZeroElt; - - // See if a clear mask is legal instead of going via - // XformToShuffleWithZero which loses UNDEF mask elements. - if (TLI.isVectorClearMaskLegal(ClearMask, IntVT)) - return DAG.getBitcast( - VT, DAG.getVectorShuffle(IntVT, DL, DAG.getBitcast(IntVT, N0), - DAG.getConstant(0, DL, IntVT), ClearMask)); - - if (TLI.isOperationLegalOrCustom(ISD::AND, IntVT)) - return DAG.getBitcast( - VT, DAG.getNode(ISD::AND, DL, IntVT, DAG.getBitcast(IntVT, N0), - DAG.getBuildVector(IntVT, DL, AndMask))); + // Transform the type to a legal type so that the buildvector constant + // elements are not illegal. 
Make sure that the result is larger than the + // original type, incase the value is split into two (eg i64->i32). + if (!TLI.isTypeLegal(IntSVT) && LegalTypes) + IntSVT = TLI.getTypeToTransformTo(*DAG.getContext(), IntSVT); + if (IntSVT.getSizeInBits() >= IntVT.getScalarSizeInBits()) { + SDValue ZeroElt = DAG.getConstant(0, DL, IntSVT); + SDValue AllOnesElt = DAG.getAllOnesConstant(DL, IntSVT); + SmallVector AndMask(NumElts, DAG.getUNDEF(IntSVT)); + for (int I = 0; I != (int)NumElts; ++I) + if (0 <= Mask[I]) + AndMask[I] = Mask[I] == I ? AllOnesElt : ZeroElt; + + // See if a clear mask is legal instead of going via + // XformToShuffleWithZero which loses UNDEF mask elements. + if (TLI.isVectorClearMaskLegal(ClearMask, IntVT)) + return DAG.getBitcast( + VT, DAG.getVectorShuffle(IntVT, DL, DAG.getBitcast(IntVT, N0), + DAG.getConstant(0, DL, IntVT), ClearMask)); + + if (TLI.isOperationLegalOrCustom(ISD::AND, IntVT)) + return DAG.getBitcast( + VT, DAG.getNode(ISD::AND, DL, IntVT, DAG.getBitcast(IntVT, N0), + DAG.getBuildVector(IntVT, DL, AndMask))); + } } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4508,6 +4508,9 @@ return true; switch (Opcode) { + case ISD::VALUETYPE: + return true; + case ISD::UNDEF: return PoisonOnly; @@ -4564,6 +4567,8 @@ unsigned Opcode = Op.getOpcode(); switch (Opcode) { + case ISD::AssertSext: + case ISD::AssertZext: case ISD::FREEZE: case ISD::AND: case ISD::OR: @@ -4575,6 +4580,7 @@ case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::TRUNCATE: + case ISD::SIGN_EXTEND_INREG: case ISD::BITCAST: return false; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4733,7 +4733,8 @@ EVT MemVT = TLI.getMemValueType(DAG.getDataLayout(), I.getValueOperand()->getType()); - if (I.getAlign().value() < MemVT.getSizeInBits() / 8) + if (!TLI.supportsUnalignedAtomics() && + I.getAlign().value() < MemVT.getSizeInBits() / 8) report_fatal_error("Cannot generate unaligned atomic store"); auto Flags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout()); diff --git a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt --- a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt @@ -22,6 +22,7 @@ ELF.cpp ELFLinkGraphBuilder.cpp ELF_aarch64.cpp + ELF_i386.cpp ELF_riscv.cpp ELF_x86_64.cpp @@ -33,6 +34,7 @@ # Architectures: aarch64.cpp + i386.cpp riscv.cpp x86_64.cpp diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp --- a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp @@ -14,6 +14,7 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/ExecutionEngine/JITLink/ELF_aarch64.h" +#include "llvm/ExecutionEngine/JITLink/ELF_i386.h" #include "llvm/ExecutionEngine/JITLink/ELF_riscv.h" #include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h" #include "llvm/Object/ELF.h" @@ -71,6 +72,8 @@ return createLinkGraphFromELFObject_riscv(ObjectBuffer); case ELF::EM_X86_64: return createLinkGraphFromELFObject_x86_64(ObjectBuffer); + case ELF::EM_386: + return createLinkGraphFromELFObject_i386(ObjectBuffer); default: return make_error( "Unsupported target machine 
architecture in ELF object " + @@ -91,6 +94,9 @@ case Triple::x86_64: link_ELF_x86_64(std::move(G), std::move(Ctx)); return; + case Triple::x86: + link_ELF_i386(std::move(G), std::move(Ctx)); + return; default: Ctx->notifyFailed(make_error( "Unsupported target machine architecture in ELF link graph " + diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp @@ -0,0 +1,116 @@ +//===----- ELF_i386.cpp - JIT linker implementation for ELF/i386 ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// ELF/i386 jit-link implementation. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/ELF_i386.h" +#include "ELFLinkGraphBuilder.h" +#include "JITLinkGeneric.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/ExecutionEngine/JITLink/i386.h" +#include "llvm/Object/ELFObjectFile.h" + +#define DEBUG_TYPE "jitlink" + +using namespace llvm; +using namespace llvm::jitlink; + +namespace llvm { +namespace jitlink { + +class ELFJITLinker_i386 : public JITLinker { + friend class JITLinker; + +public: + ELFJITLinker_i386(std::unique_ptr Ctx, + std::unique_ptr G, PassConfiguration PassConfig) + : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) {} + +private: + Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const { + using namespace i386; + using namespace llvm::support; + + switch (E.getKind()) { + case i386::None: { + break; + } + } + return Error::success(); + } +}; + +template +class ELFLinkGraphBuilder_i386 : public ELFLinkGraphBuilder { +private: + static Expected getRelocationKind(const uint32_t Type) { + using namespace i386; + switch (Type) { + case ELF::R_386_NONE: + return EdgeKind_i386::None; + } + + return make_error("Unsupported i386 relocation:" + + formatv("{0:d}", Type)); + } + + Error addRelocations() override { + LLVM_DEBUG(dbgs() << "Adding relocations\n"); + using Base = ELFLinkGraphBuilder; + + return Error::success(); + } + +public: + ELFLinkGraphBuilder_i386(StringRef FileName, const object::ELFFile &Obj, + const Triple T) + : ELFLinkGraphBuilder(Obj, std::move(T), FileName, + i386::getEdgeKindName) {} +}; + +Expected> +createLinkGraphFromELFObject_i386(MemoryBufferRef ObjectBuffer) { + LLVM_DEBUG({ + dbgs() << "Building jitlink graph for new input " + << ObjectBuffer.getBufferIdentifier() << "...\n"; + }); + + auto ELFObj = object::ObjectFile::createELFObjectFile(ObjectBuffer); + if (!ELFObj) + return ELFObj.takeError(); + + assert((*ELFObj)->getArch() == Triple::x86 && + "Only i386 (little endian) is supported for now"); + + auto &ELFObjFile = cast>(**ELFObj); + return ELFLinkGraphBuilder_i386((*ELFObj)->getFileName(), + ELFObjFile.getELFFile(), + (*ELFObj)->makeTriple()) + .buildGraph(); +} + +void link_ELF_i386(std::unique_ptr G, + std::unique_ptr Ctx) { + PassConfiguration Config; + const Triple &TT = G->getTargetTriple(); + if (Ctx->shouldAddDefaultTargetPasses(TT)) { + if (auto MarkLive = Ctx->getMarkLivePass(TT)) + Config.PrePrunePasses.push_back(std::move(MarkLive)); + else + Config.PrePrunePasses.push_back(markAllSymbolsLive); + } + if (auto Err = Ctx->modifyPassConfig(*G, Config)) + return 
Ctx->notifyFailed(std::move(Err)); + + ELFJITLinker_i386::link(std::move(Ctx), std::move(G), std::move(Config)); +} + +} // namespace jitlink +} // namespace llvm \ No newline at end of file diff --git a/llvm/lib/ExecutionEngine/JITLink/i386.cpp b/llvm/lib/ExecutionEngine/JITLink/i386.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/i386.cpp @@ -0,0 +1,30 @@ +//===---- i386.cpp - Generic JITLink i386 edge kinds, utilities -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing i386 objects. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/i386.h" + +#define DEBUG_TYPE "jitlink" + +namespace llvm { +namespace jitlink { +namespace i386 { + +const char *getEdgeKindName(Edge::Kind K) { + switch (K) { + case None: + return "None"; + } + return getGenericEdgeKindName(K); +} +} // namespace i386 +} // namespace jitlink +} // namespace llvm \ No newline at end of file diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/JITLink/x86_64.h" #include "llvm/ExecutionEngine/Orc/Layer.h" #include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" #include "llvm/IR/Constants.h" @@ -350,7 +351,6 @@ Error StaticLibraryDefinitionGenerator::tryToGenerate( LookupState &LS, LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags, const SymbolLookupSet &Symbols) { - // Don't materialize symbols from static archives unless this is a static // lookup. if (K != LookupKind::Static) @@ -430,5 +430,121 @@ Err = buildObjectFilesMap(); } +std::unique_ptr +DLLImportDefinitionGenerator::Create(ExecutionSession &ES, + ObjectLinkingLayer &L) { + return std::unique_ptr( + new DLLImportDefinitionGenerator(ES, L)); +} + +Error DLLImportDefinitionGenerator::tryToGenerate( + LookupState &LS, LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, const SymbolLookupSet &Symbols) { + JITDylibSearchOrder LinkOrder; + JD.withLinkOrderDo([&](const JITDylibSearchOrder &LO) { + LinkOrder.reserve(LO.size()); + for (auto &KV : LO) { + if (KV.first == &JD) + continue; + LinkOrder.push_back(KV); + } + }); + + // FIXME: if regular symbol name start with __imp_ we have to issue lookup of + // both __imp_ and stripped name and use the lookup information to resolve the + // real symbol name. 
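For context on the FIXME above: on Windows, a dllimport reference is compiled as an indirect access through a pointer symbol named __imp_<symbol>. A minimal illustration of the client code this generator serves (hypothetical example, not part of the patch):

  // MSVC-style C++; the call is emitted as an indirect call through the
  // pointer slot __imp_foo rather than a direct call to foo.
  __declspec(dllimport) int foo(int);
  int callsFoo() { return foo(1); }

The code that follows therefore strips the __imp_ prefix before performing the lookup, and then synthesizes both the __imp_ pointer slot and a jump stub under the original name.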
+  SymbolLookupSet LookupSet;
+  DenseMap<StringRef, SymbolLookupFlags> ToLookUpSymbols;
+  for (auto &KV : Symbols) {
+    StringRef Deinterned = *KV.first;
+    if (Deinterned.startswith(getImpPrefix()))
+      Deinterned = Deinterned.drop_front(StringRef(getImpPrefix()).size());
+    // Don't degrade the required state
+    if (ToLookUpSymbols.count(Deinterned) &&
+        ToLookUpSymbols[Deinterned] == SymbolLookupFlags::RequiredSymbol)
+      continue;
+    ToLookUpSymbols[Deinterned] = KV.second;
+  }
+
+  for (auto &KV : ToLookUpSymbols)
+    LookupSet.add(ES.intern(KV.first), KV.second);
+
+  auto Resolved =
+      ES.lookup(LinkOrder, LookupSet, LookupKind::DLSym, SymbolState::Resolved);
+  if (!Resolved)
+    return Resolved.takeError();
+
+  auto G = createStubsGraph(*Resolved);
+  if (!G)
+    return G.takeError();
+  return L.add(JD, std::move(*G));
+}
+
+Expected<unsigned>
+DLLImportDefinitionGenerator::getTargetPointerSize(const Triple &TT) {
+  switch (TT.getArch()) {
+  case Triple::x86_64:
+    return 8;
+  default:
+    return make_error<StringError>(
+        "architecture unsupported by DLLImportDefinitionGenerator",
+        inconvertibleErrorCode());
+  }
+}
+
+Expected<support::endianness>
+DLLImportDefinitionGenerator::getTargetEndianness(const Triple &TT) {
+  switch (TT.getArch()) {
+  case Triple::x86_64:
+    return support::endianness::little;
+  default:
+    return make_error<StringError>(
+        "architecture unsupported by DLLImportDefinitionGenerator",
+        inconvertibleErrorCode());
+  }
+}
+
+Expected<std::unique_ptr<jitlink::LinkGraph>>
+DLLImportDefinitionGenerator::createStubsGraph(const SymbolMap &Resolved) {
+  Triple TT = ES.getExecutorProcessControl().getTargetTriple();
+  auto PointerSize = getTargetPointerSize(TT);
+  if (!PointerSize)
+    return PointerSize.takeError();
+  auto Endianness = getTargetEndianness(TT);
+  if (!Endianness)
+    return Endianness.takeError();
+
+  auto G = std::make_unique<jitlink::LinkGraph>(
+      "<DLLIMPORT_STUBS>", TT, *PointerSize, *Endianness,
+      jitlink::getGenericEdgeKindName);
+  jitlink::Section &Sec = G->createSection(
+      getSectionName(), jitlink::MemProt::Read | jitlink::MemProt::Exec);
+
+  for (auto &KV : Resolved) {
+    jitlink::Symbol &Target = G->addAbsoluteSymbol(
+        *KV.first, ExecutorAddr(KV.second.getAddress()), *PointerSize,
+        jitlink::Linkage::Strong, jitlink::Scope::Local, false);
+
+    // Create __imp_ symbol
+    jitlink::Symbol &Ptr =
+        jitlink::x86_64::createAnonymousPointer(*G, Sec, &Target);
+    auto NameCopy = G->allocateString(Twine(getImpPrefix()) + *KV.first);
+    StringRef NameCopyRef = StringRef(NameCopy.data(), NameCopy.size());
+    Ptr.setName(NameCopyRef);
+    Ptr.setLinkage(jitlink::Linkage::Strong);
+    Ptr.setScope(jitlink::Scope::Default);
+
+    // Create PLT stub
+    // FIXME: check PLT stub of data symbol is not accessed
+    jitlink::Block &StubBlock =
+        jitlink::x86_64::createPointerJumpStubBlock(*G, Sec, Ptr);
+    G->addDefinedSymbol(StubBlock, 0, *KV.first, StubBlock.getSize(),
+                        jitlink::Linkage::Strong, jitlink::Scope::Default, true,
+                        false);
+  }
+
+  return std::move(G);
+}
+
 } // End namespace orc.
 } // End namespace llvm.
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -412,21 +412,20 @@
   // MDNode. This loop also initializes DILocationReachable, later
   // needed by updateLoopMetadataDebugLocationsImpl; the use of
   // count_if avoids an early exit.
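A structural note on the DebugInfo.cpp hunk that follows: loop metadata is self-referential, i.e. operand 0 of the !llvm.loop node is the node itself (for instance !0 = distinct !{!0, !1, !2}, with !1 a DILocation and !2 a property such as !{!"llvm.loop.unroll.disable"}), which is why the scans start at the second operand; llvm::drop_begin states that more directly than op_begin() + 1. One behavioral nuance: std::count_if visits every operand, while llvm::none_of stops at the first operand for which the predicate holds, so the side effect the comment above relies on (fully populating DILocationReachable) no longer extends past an early match.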
- if (!std::count_if(N->op_begin() + 1, N->op_end(), - [&Visited, &DILocationReachable](const MDOperand &Op) { - return isDILocationReachable( - Visited, DILocationReachable, Op.get()); - })) + if (llvm::none_of(llvm::drop_begin(N->operands()), + [&Visited, &DILocationReachable](const MDOperand &Op) { + return isDILocationReachable(Visited, DILocationReachable, + Op.get()); + })) return N; // If there is only the debug location without any actual loop metadata, we // can remove the metadata. - if (std::all_of( - N->op_begin() + 1, N->op_end(), - [&Visited, &DILocationReachable](const MDOperand &Op) { - return isDILocationReachable(Visited, DILocationReachable, - Op.get()); - })) + if (llvm::all_of(llvm::drop_begin(N->operands()), + [&Visited, &DILocationReachable](const MDOperand &Op) { + return isDILocationReachable(Visited, DILocationReachable, + Op.get()); + })) return nullptr; return updateLoopMetadataDebugLocationsImpl( diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -869,7 +869,7 @@ // Remap compilation directory. remapDebugPath(CompilationDir); - // Remap MCDwarfDirs in all compilation units. + // Remap MCDwarfDirs and RootFile.Name in all compilation units. SmallString<256> P; for (auto &CUIDTablePair : MCDwarfLineTablesCUMap) { for (auto &Dir : CUIDTablePair.second.getMCDwarfDirs()) { @@ -877,6 +877,12 @@ remapDebugPath(P); Dir = std::string(P); } + + // Used by DW_TAG_compile_unit's DT_AT_name and DW_TAG_label's + // DW_AT_decl_file for DWARF v5 generated for assembly source. + P = CUIDTablePair.second.getRootFile().Name; + remapDebugPath(P); + CUIDTablePair.second.getRootFile().Name = std::string(P); } } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8235,15 +8235,17 @@ Swap = true; } } - // 64-bit check whether we can use CSINC. To avoid signed integer - // overflow the condition ignores wrap around, which is already - // handled by CSINV above. - } else if (1 == - std::max(TrueVal, FalseVal) - std::min(TrueVal, FalseVal)) { - Opcode = AArch64ISD::CSINC; - - if (TrueVal > FalseVal) { - Swap = true; + } else { + // 64-bit check whether we can use CSINC. 
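The rewrite that follows sidesteps signed overflow: the old form computed std::max(TrueVal, FalseVal) - std::min(TrueVal, FalseVal), which is undefined behavior for INT64_MIN versus 0, whereas unsigned wraparound is well defined. A standalone sketch of the check (illustrative only, mirroring the logic below; the helper name is hypothetical):

  #include <cstdint>
  // True when the two select arms differ by exactly one; computed in
  // uint64_t so INT64_MIN vs. 0 wraps instead of overflowing.
  bool areAdjacent(int64_t TrueVal, int64_t FalseVal) {
    const uint64_t T = TrueVal, F = FalseVal;
    return T == F + 1 || T + 1 == F;
  }

With TrueVal = INT64_MIN and FalseVal = 0 both comparisons are false, so CSINC is correctly rejected; the new foo18_overflow3/foo18_overflow4 tests in arm64-csel.ll later in this patch pin down exactly that case.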
+ const uint64_t TrueVal64 = TrueVal; + const uint64_t FalseVal64 = FalseVal; + + if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) { + Opcode = AArch64ISD::CSINC; + + if (TrueVal > FalseVal) { + Swap = true; + } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -338,9 +338,9 @@ static bool canFitIntoPipeline(SUnit &SU, ScheduleDAGInstrs *DAG, DenseSet &ConflictedInstrs) { - return std::all_of( - ConflictedInstrs.begin(), ConflictedInstrs.end(), - [DAG, &SU](SUnit *SuccSU) { return DAG->canAddEdge(SuccSU, &SU); }); + return llvm::all_of(ConflictedInstrs, [DAG, &SU](SUnit *SuccSU) { + return DAG->canAddEdge(SuccSU, &SU); + }); } void SchedGroup::initSchedGroup(std::vector::reverse_iterator RIter, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -7679,10 +7679,9 @@ // extend that single value SDValue FirstOp = Op.getOperand(0); if (!isa(FirstOp) && - std::all_of(std::next(Op->op_begin()), Op->op_end(), - [&FirstOp](SDUse &U) { - return U.get().isUndef() || U.get() == FirstOp; - })) { + llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) { + return U.get().isUndef() || U.get() == FirstOp; + })) { SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, DAG.getValueType(MVT::i1)); return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp --- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -648,45 +648,28 @@ MachineInstr &MI = *MBBI; Register DstLoReg, DstHiReg; Register DstReg = MI.getOperand(0).getReg(); - Register TmpReg = 0; // 0 for no temporary register Register SrcReg = MI.getOperand(1).getReg(); bool SrcIsKill = MI.getOperand(1).isKill(); unsigned OpLo = AVR::LDRdPtr; unsigned OpHi = AVR::LDDRdPtrQ; TRI->splitReg(DstReg, DstLoReg, DstHiReg); - // Use a temporary register if src and dst registers are the same. - if (DstReg == SrcReg) - TmpReg = scavengeGPR8(MI); - - Register CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg; - Register CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg; + // DstReg has an earlyclobber so the register allocator will allocate them in + // separate registers. + assert(DstReg != SrcReg && "Dst and Src registers are the same!"); // Load low byte. - auto MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(CurDstLoReg, RegState::Define) - .addReg(SrcReg); - - // Push low byte onto stack if necessary. - if (TmpReg) - buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg); + buildMI(MBB, MBBI, OpLo) + .addReg(DstLoReg, RegState::Define) + .addReg(SrcReg) + .setMemRefs(MI.memoperands()); // Load high byte. - auto MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(CurDstHiReg, RegState::Define) - .addReg(SrcReg, getKillRegState(SrcIsKill)) - .addImm(1); - - if (TmpReg) { - // Move the high byte into the final destination. - buildMI(MBB, MBBI, AVR::MOVRdRr, DstHiReg).addReg(TmpReg); - - // Move the low byte from the scratch space into the final destination. 
- buildMI(MBB, MBBI, AVR::POPRd, DstLoReg); - } - - MIBLO.setMemRefs(MI.memoperands()); - MIBHI.setMemRefs(MI.memoperands()); + buildMI(MBB, MBBI, OpHi) + .addReg(DstHiReg, RegState::Define) + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(1) + .setMemRefs(MI.memoperands()); MI.eraseFromParent(); return true; @@ -763,7 +746,6 @@ MachineInstr &MI = *MBBI; Register DstLoReg, DstHiReg; Register DstReg = MI.getOperand(0).getReg(); - Register TmpReg = 0; // 0 for no temporary register Register SrcReg = MI.getOperand(1).getReg(); unsigned Imm = MI.getOperand(2).getImm(); bool SrcIsKill = MI.getOperand(1).isKill(); @@ -775,39 +757,23 @@ // highest Imm value allowed for the instruction, 62 is the limit here. assert(Imm <= 62 && "Offset is out of range"); - // Use a temporary register if src and dst registers are the same. - if (DstReg == SrcReg) - TmpReg = scavengeGPR8(MI); - - Register CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg; - Register CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg; + // DstReg has an earlyclobber so the register allocator will allocate them in + // separate registers. + assert(DstReg != SrcReg && "Dst and Src registers are the same!"); // Load low byte. - auto MIBLO = buildMI(MBB, MBBI, OpLo) - .addReg(CurDstLoReg, RegState::Define) - .addReg(SrcReg) - .addImm(Imm); - - // Push low byte onto stack if necessary. - if (TmpReg) - buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg); + buildMI(MBB, MBBI, OpLo) + .addReg(DstLoReg, RegState::Define) + .addReg(SrcReg) + .addImm(Imm) + .setMemRefs(MI.memoperands()); // Load high byte. - auto MIBHI = buildMI(MBB, MBBI, OpHi) - .addReg(CurDstHiReg, RegState::Define) - .addReg(SrcReg, getKillRegState(SrcIsKill)) - .addImm(Imm + 1); - - if (TmpReg) { - // Move the high byte into the final destination. - buildMI(MBB, MBBI, AVR::MOVRdRr, DstHiReg).addReg(TmpReg); - - // Move the low byte from the scratch space into the final destination. - buildMI(MBB, MBBI, AVR::POPRd, DstLoReg); - } - - MIBLO.setMemRefs(MI.memoperands()); - MIBHI.setMemRefs(MI.memoperands()); + buildMI(MBB, MBBI, OpHi) + .addReg(DstHiReg, RegState::Define) + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(Imm + 1) + .setMemRefs(MI.memoperands()); MI.eraseFromParent(); return true; @@ -1382,8 +1348,8 @@ .addReg(DstReg, getKillRegState(DstIsKill)) .addReg(ZERO_REGISTER); - // SREG is always implicitly killed - MIB->getOperand(2).setIsKill(); + MIB->getOperand(3).setIsDead(); // SREG is always dead + MIB->getOperand(4).setIsKill(); // SREG is always implicitly killed MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -56,6 +56,7 @@ const AVRSubtarget &STI = MF.getSubtarget(); const AVRInstrInfo &TII = *STI.getInstrInfo(); const AVRMachineFunctionInfo *AFI = MF.getInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); bool HasFP = hasFP(MF); // Interrupt handlers re-enable interrupts in function entry. @@ -68,8 +69,8 @@ // Emit special prologue code to save R1, R0 and SREG in interrupt/signal // handlers before saving any other registers. 
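Concretely, the interrupt/signal-handler prologue built below now amounts to the following (a sketch in AVR assembly, per the BuildMI calls that follow):

  push r0        ; save R0
  in   r0, SREG  ; read the status register through R0
  push r0        ; save SREG
  push r1        ; only if R1 is used anywhere in the function:
  clr  r1        ;   re-establish R1 as the zero register

whereas previously R1:R0 were pushed unconditionally as a pair and R1 was always cleared. Skipping the R1 traffic in handlers that never touch R1 is the point of the new MRI.reg_empty(AVR::R1) checks here and in the matching epilogue.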
if (AFI->isInterruptOrSignalHandler()) { - BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHWRr)) - .addReg(AVR::R1R0, RegState::Kill) + BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHRr)) + .addReg(AVR::R0, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII.get(AVR::INRdA), AVR::R0) @@ -78,11 +79,16 @@ BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHRr)) .addReg(AVR::R0, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); - BuildMI(MBB, MBBI, DL, TII.get(AVR::EORRdRr)) - .addReg(AVR::R1, RegState::Define) - .addReg(AVR::R1, RegState::Kill) - .addReg(AVR::R1, RegState::Kill) - .setMIFlag(MachineInstr::FrameSetup); + if (!MRI.reg_empty(AVR::R1)) { + BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHRr)) + .addReg(AVR::R1, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII.get(AVR::EORRdRr)) + .addReg(AVR::R1, RegState::Define) + .addReg(AVR::R1, RegState::Kill) + .addReg(AVR::R1, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } } // Early exit if the frame pointer is not needed in this function. @@ -132,6 +138,7 @@ static void restoreStatusRegister(MachineFunction &MF, MachineBasicBlock &MBB) { const AVRMachineFunctionInfo *AFI = MF.getInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); @@ -142,11 +149,14 @@ // Emit special epilogue code to restore R1, R0 and SREG in interrupt/signal // handlers at the very end of the function, just before reti. if (AFI->isInterruptOrSignalHandler()) { + if (!MRI.reg_empty(AVR::R1)) { + BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R1); + } BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R0); BuildMI(MBB, MBBI, DL, TII.get(AVR::OUTARr)) .addImm(STI.getIORegSREG()) .addReg(AVR::R0, RegState::Kill); - BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R1R0); + BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R0); } } diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -57,6 +57,8 @@ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand); + setOperationAction(ISD::INLINEASM, MVT::Other, Custom); + for (MVT VT : MVT::integer_valuetypes()) { for (auto N : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) { setLoadExtAction(N, VT, MVT::i1, Promote); @@ -836,6 +838,52 @@ MachinePointerInfo(SV)); } +// Modify the existing ISD::INLINEASM node to add the implicit register r1. +SDValue AVRTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { + SDValue R1Reg = DAG.getRegister(AVR::R1, MVT::i8); + if (Op.getOperand(Op.getNumOperands() - 1) == R1Reg || + Op.getOperand(Op.getNumOperands() - 2) == R1Reg) { + // R1 has already been added. Don't add it again. + // If this isn't handled, we get called over and over again. + return Op; + } + + // Get a list of operands to the new INLINEASM node. This is mostly a copy, + // with some edits. + // Add the following operands at the end (but before the glue node, if it's + // there): + // - The flags of the implicit R1 register operand. + // - The implicit R1 register operand itself. + SDLoc dl(Op); + SmallVector Ops; + SDNode *N = Op.getNode(); + SDValue Glue; + for (unsigned I = 0; I < N->getNumOperands(); I++) { + SDValue Operand = N->getOperand(I); + if (Operand.getValueType() == MVT::Glue) { + // The glue operand always needs to be at the end, so we need to treat it + // specially. 
+ Glue = Operand; + } else { + Ops.push_back(Operand); + } + } + unsigned Flags = InlineAsm::getFlagWord(InlineAsm::Kind_RegUse, 1); + Ops.push_back(DAG.getTargetConstant(Flags, dl, MVT::i32)); + Ops.push_back(R1Reg); + if (Glue) { + Ops.push_back(Glue); + } + + // Replace the current INLINEASM node with a new one that has R1 as implicit + // parameter. + SDValue New = DAG.getNode(N->getOpcode(), dl, N->getVTList(), Ops); + DAG.ReplaceAllUsesOfValueWith(Op, New); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), New.getValue(1)); + + return New; +} + SDValue AVRTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: @@ -861,6 +909,8 @@ case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); + case ISD::INLINEASM: + return LowerINLINEASM(Op, DAG); } return SDValue(); @@ -1451,6 +1501,10 @@ Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); } + // The R1 register must be passed as an implicit register so that R1 is + // correctly zeroed in interrupts. + Ops.push_back(DAG.getRegister(AVR::R1, MVT::i8)); + // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *Mask = @@ -1572,6 +1626,14 @@ const AVRMachineFunctionInfo *AFI = MF.getInfo(); + if (!AFI->isInterruptOrSignalHandler()) { + // The return instruction has an implicit R1 operand: it must contain zero + // on return. + // This is not needed in interrupts however, where R1 is handled specially + // (only pushed/popped when needed). + RetOps.push_back(DAG.getRegister(AVR::R1, MVT::i8)); + } + unsigned RetOpc = AFI->isInterruptOrSignalHandler() ? AVRISD::RETI_FLAG : AVRISD::RET_FLAG; diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td --- a/llvm/lib/Target/AVR/AVRInstrInfo.td +++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -915,6 +915,7 @@ // neg Rd+1 // neg Rd // sbc Rd+1, r1 + let Uses = [R1] in def NEGWRd : Pseudo<(outs DREGS : $rd), (ins DREGS @@ -1986,6 +1987,7 @@ def ASRWLoRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "asrwlo\t$rd", [(set i16:$rd, (AVRasrlo i16:$src)), (implicit SREG)]>; + let Uses = [R1] in def ROLBRd : Pseudo<(outs GPR8 : $rd), (ins GPR8 diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -39,8 +39,8 @@ // Do not save RA to the SCS if it's not saved to the regular stack, // i.e. RA is not at risk of being overwritten. std::vector &CSI = MF.getFrameInfo().getCalleeSavedInfo(); - if (std::none_of(CSI.begin(), CSI.end(), - [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) + if (llvm::none_of( + CSI, [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) return; Register SCSPReg = RISCVABI::getSCSPReg(); @@ -89,8 +89,8 @@ // See emitSCSPrologue() above. 
std::vector &CSI = MF.getFrameInfo().getCalleeSavedInfo(); - if (std::none_of(CSI.begin(), CSI.end(), - [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) + if (llvm::none_of( + CSI, [&](CalleeSavedInfo &CSR) { return CSR.getReg() == RAReg; })) return; Register SCSPReg = RISCVABI::getSCSPReg(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1104,6 +1104,8 @@ // On RV32, 64-bit integers are split into their high and low parts and held // in two different registers, so the trunc is free since the low register can // just be used. +// FIXME: Should we consider i64->i32 free on RV64 to match the EVT version of +// isTruncateFree? bool RISCVTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { if (Subtarget.is64Bit() || !SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) return false; @@ -1113,8 +1115,10 @@ } bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { - if (Subtarget.is64Bit() || SrcVT.isVector() || DstVT.isVector() || - !SrcVT.isInteger() || !DstVT.isInteger()) + // We consider i64->i32 free on RV64 since we have good selection of W + // instructions that make promoting operations back to i64 free in many cases. + if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || + !DstVT.isInteger()) return false; unsigned SrcBits = SrcVT.getSizeInBits(); unsigned DestBits = DstVT.getSizeInBits(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29735,8 +29735,22 @@ uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); - if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) + if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) { + // Hardware support for vector shifts is sparse which makes us scalarize the + // vector operations in many cases. Also, on sandybridge ADD is faster than + // shl: (shl V, 1) -> (add (freeze V), (freeze V)) + if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) { + // R may be undef at run-time, but (shl R, 1) must be an even number (LSB + // must be 0). (add undef, undef) however can be any value. To make this + // safe, we must freeze R to ensure that register allocation uses the same + // register for an undefined value. This ensures that the result will + // still be even and preserves the original semantics. + R = DAG.getFreeze(R); + return DAG.getNode(ISD::ADD, dl, VT, R, R); + } + return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + } // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || @@ -46674,20 +46688,6 @@ } } - // Hardware support for vector shifts is sparse which makes us scalarize the - // vector operations in many cases. Also, on sandybridge ADD is faster than - // shl. - // (shl V, 1) -> add V,V - if (auto *N1BV = dyn_cast(N1)) - if (auto *N1SplatC = N1BV->getConstantSplatNode()) { - assert(N0.getValueType().isVector() && "Invalid vector shift type"); - // We shift all of the values by one. In many cases we do not have - // hardware support for this operation. This is better expressed as an ADD - // of two values. 
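On the freeze in the re-added fold above: (shl R, 1) always has a zero low bit, even when R is undef, but each use of an undef value may independently take a different value, so the unfrozen (add V, V) form being deleted here could produce an odd result. Freezing first (in IR terms, %f = freeze %r followed by %s = add %f, %f) forces both add operands to be the same concrete value and preserves the evenness guarantee, which is why the combine moves from the generic DAG combine into the shift-lowering path with an explicit DAG.getFreeze(R).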
- if (N1SplatC->isOne()) - return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); - } - return SDValue(); } @@ -47269,12 +47269,18 @@ TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); - assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) || - (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) || - N->getOpcode() == ISD::INSERT_VECTOR_ELT) && + unsigned Opcode = N->getOpcode(); + assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) || + (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) || + Opcode == ISD::INSERT_VECTOR_ELT) && "Unexpected vector insertion"); - if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) { + // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt). + if (Opcode == ISD::INSERT_VECTOR_ELT && N->getOperand(0).isUndef() && + isNullConstant(N->getOperand(2))) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, N->getOperand(1)); + + if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) { unsigned NumBitsPerElt = VT.getScalarSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -546,54 +546,8 @@ writeThinLinkBitcodeToFile(M, *ThinLinkOS, *Index, ModHash); } -class WriteThinLTOBitcode : public ModulePass { - raw_ostream &OS; // raw_ostream to print on - // The output stream on which to emit a minimized module for use - // just in the thin link, if requested. - raw_ostream *ThinLinkOS = nullptr; - -public: - static char ID; // Pass identification, replacement for typeid - WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()) { - initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry()); - } - - explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS) - : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) { - initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { return "ThinLTO Bitcode Writer"; } - - bool runOnModule(Module &M) override { - const ModuleSummaryIndex *Index = - &(getAnalysis().getIndex()); - writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index); - return true; - } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - } -}; } // anonymous namespace -char WriteThinLTOBitcode::ID = 0; -INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode", - "Write ThinLTO Bitcode", false, true) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode", - "Write ThinLTO Bitcode", false, true) - -ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str, - raw_ostream *ThinLinkOS) { - return new WriteThinLTOBitcode(Str, ThinLinkOS); -} - PreservedAnalyses llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { FunctionAnalysisManager &FAM = diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -3185,6 
+3185,20 @@
   if (auto *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0)))
     if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, II, *C))
       return I;
+
+    // (extractval ([s/u]subo X, Y), 0) == 0 --> X == Y
+    // (extractval ([s/u]subo X, Y), 0) != 0 --> X != Y
+    // TODO: This checks one-use, but that is not strictly necessary.
+    Value *Cmp0 = Cmp.getOperand(0);
+    Value *X, *Y;
+    if (C->isZero() && Cmp.isEquality() && Cmp0->hasOneUse() &&
+        (match(Cmp0,
+               m_ExtractValue<0>(m_Intrinsic<Intrinsic::ssub_with_overflow>(
+                   m_Value(X), m_Value(Y)))) ||
+         match(Cmp0,
+               m_ExtractValue<0>(m_Intrinsic<Intrinsic::usub_with_overflow>(
+                   m_Value(X), m_Value(Y))))))
+      return new ICmpInst(Cmp.getPredicate(), X, Y);
   }

   if (match(Cmp.getOperand(1), m_APIntAllowUndef(C)))
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -473,7 +473,8 @@
 static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
   // Normal constant int.
   ConstantInt *CI = dyn_cast<ConstantInt>(V);
-  if (CI || !isa<Constant>(V) || !V->getType()->isPointerTy())
+  if (CI || !isa<Constant>(V) || !V->getType()->isPointerTy() ||
+      DL.isNonIntegralPointerType(V->getType()))
     return CI;

   // This is some kind of pointer constant. Turn it into a pointer-sized
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -2908,6 +2908,60 @@
   return nullptr;
 }

+// Transform an snprintf call CI with the bound N to format the string Str
+// either to a call to memcpy, or to a single-character store, or to nothing,
+// and fold the result to a constant. A nonnull StrArg refers to the string
+// argument being formatted. Otherwise the call is one with N < 2 and
+// the "%c" directive to format a single character.
+Value *LibCallSimplifier::emitSnPrintfMemCpy(CallInst *CI, Value *StrArg,
+                                             StringRef Str, uint64_t N,
+                                             IRBuilderBase &B) {
+  assert(StrArg || (N < 2 && Str.size() == 1));
+
+  unsigned IntBits = TLI->getIntSize();
+  uint64_t IntMax = maxIntN(IntBits);
+  if (Str.size() > IntMax)
+    // Bail if the string is longer than INT_MAX. POSIX requires
+    // implementations to set errno to EOVERFLOW in this case, in
+    // addition to when N is larger than that (checked by the caller).
+    return nullptr;
+
+  Value *StrLen = ConstantInt::get(CI->getType(), Str.size());
+  if (N == 0)
+    return StrLen;
+
+  // Set to the number of bytes to copy from StrArg which is also
+  // the offset of the terminating nul.
+  uint64_t NCopy;
+  if (N > Str.size())
+    // Copy the full string, including the terminating nul (which must
+    // be present regardless of the bound).
+    NCopy = Str.size() + 1;
+  else
+    NCopy = N - 1;
+
+  Value *DstArg = CI->getArgOperand(0);
+  if (NCopy && StrArg)
+    // Transform the call to llvm.memcpy(dst, fmt, NCopy).
+    copyFlags(
+        *CI,
+        B.CreateMemCpy(
+            DstArg, Align(1), StrArg, Align(1),
+            ConstantInt::get(DL.getIntPtrType(CI->getContext()), NCopy)));
+
+  if (N > Str.size())
+    // Return early when the whole format string, including the final nul,
+    // has been copied.
+    return StrLen;
+
+  // Otherwise, when truncating the string, append a terminating nul.
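At the source level the truncating case of this helper behaves like the following sketch (an illustration of the intended equivalence, not code from the patch):

  #include <cstring>
  // snprintf(dst, 4, "%s", "hello") folds to approximately:
  int folded(char *dst) {
    std::memcpy(dst, "hel", 3); // NCopy = N - 1 bytes of the string
    dst[3] = '\0';              // terminating nul stored at offset N - 1
    return 5;                   // strlen("hello"), snprintf's return value
  }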
+ Type *Int8Ty = B.getInt8Ty(); + Value *NulOff = B.getIntN(IntBits, NCopy); + Value *DstEnd = B.CreateInBoundsGEP(Int8Ty, DstArg, NulOff, "endptr"); + B.CreateStore(ConstantInt::get(Int8Ty, 0), DstEnd); + return StrLen; +} + Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilderBase &B) { // Check for size @@ -2916,78 +2970,66 @@ return nullptr; uint64_t N = Size->getZExtValue(); + uint64_t IntMax = maxIntN(TLI->getIntSize()); + if (N > IntMax) + // Bail if the bound exceeds INT_MAX. POSIX requires implementations + // to set errno to EOVERFLOW in this case. + return nullptr; + + Value *DstArg = CI->getArgOperand(0); + Value *FmtArg = CI->getArgOperand(2); + // Check for a fixed format string. StringRef FormatStr; - if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr)) + if (!getConstantStringInfo(FmtArg, FormatStr)) return nullptr; // If we just have a format string (nothing else crazy) transform it. if (CI->arg_size() == 3) { - // Make sure there's no % in the constant array. We could try to handle - // %% -> % in the future if we cared. if (FormatStr.contains('%')) - return nullptr; // we found a format specifier, bail out. - - if (N == 0) - return ConstantInt::get(CI->getType(), FormatStr.size()); - else if (N < FormatStr.size() + 1) + // Bail if the format string contains a directive and there are + // no arguments. We could handle "%%" in the future. return nullptr; - // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt, - // strlen(fmt)+1) - copyFlags( - *CI, - B.CreateMemCpy( - CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - FormatStr.size() + 1))); // Copy the null byte. - return ConstantInt::get(CI->getType(), FormatStr.size()); + return emitSnPrintfMemCpy(CI, FmtArg, FormatStr, N, B); } // The remaining optimizations require the format string to be "%s" or "%c" // and have an extra operand. - if (FormatStr.size() == 2 && FormatStr[0] == '%' && CI->arg_size() == 4) { - - // Decode the second character of the format string. - if (FormatStr[1] == 'c') { - if (N == 0) - return ConstantInt::get(CI->getType(), 1); - else if (N == 1) - return nullptr; - - // snprintf(dst, size, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 - if (!CI->getArgOperand(3)->getType()->isIntegerTy()) - return nullptr; - Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char"); - Value *Ptr = castToCStr(CI->getArgOperand(0), B); - B.CreateStore(V, Ptr); - Ptr = B.CreateInBoundsGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); - B.CreateStore(B.getInt8(0), Ptr); + if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->arg_size() != 4) + return nullptr; - return ConstantInt::get(CI->getType(), 1); + // Decode the second character of the format string. + if (FormatStr[1] == 'c') { + if (N <= 1) { + // Use an arbitary string of length 1 to transform the call into + // either a nul store (N == 1) or a no-op (N == 0) and fold it + // to one. 
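Any single-byte string works for CharStr here because with N <= 1 none of the character's bytes are ever stored: N == 1 writes only the terminating nul, N == 0 writes nothing, and in both cases the call still folds to 1, the length that would have been written.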
+ StringRef CharStr("*"); + return emitSnPrintfMemCpy(CI, nullptr, CharStr, N, B); } - if (FormatStr[1] == 's') { - // snprintf(dest, size, "%s", str) to llvm.memcpy(dest, str, len+1, 1) - StringRef Str; - if (!getConstantStringInfo(CI->getArgOperand(3), Str)) - return nullptr; + // snprintf(dst, size, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 + if (!CI->getArgOperand(3)->getType()->isIntegerTy()) + return nullptr; + Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char"); + Value *Ptr = castToCStr(DstArg, B); + B.CreateStore(V, Ptr); + Ptr = B.CreateInBoundsGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul"); + B.CreateStore(B.getInt8(0), Ptr); + return ConstantInt::get(CI->getType(), 1); + } - if (N == 0) - return ConstantInt::get(CI->getType(), Str.size()); - else if (N < Str.size() + 1) - return nullptr; + if (FormatStr[1] != 's') + return nullptr; - copyFlags( - *CI, B.CreateMemCpy(CI->getArgOperand(0), Align(1), - CI->getArgOperand(3), Align(1), - ConstantInt::get(CI->getType(), Str.size() + 1))); + Value *StrArg = CI->getArgOperand(3); + // snprintf(dest, size, "%s", str) to llvm.memcpy(dest, str, len+1, 1) + StringRef Str; + if (!getConstantStringInfo(StrArg, Str)) + return nullptr; - // The snprintf result is the unincremented number of bytes in the string. - return ConstantInt::get(CI->getType(), Str.size()); - } - } - return nullptr; + return emitSnPrintfMemCpy(CI, StrArg, Str, N, B); } Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilderBase &B) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -205,7 +205,7 @@ /// \returns True if the value is a constant (but not globals/constant /// expressions). static bool isConstant(Value *V) { - return isa(V) && !isa(V) && !isa(V); + return isa(V) && !isa(V); } /// Checks if \p V is one of vector-like instructions, i.e. undef, @@ -2994,7 +2994,7 @@ // okay. auto *In = BundleMember->Inst; assert(In && - (isa(In) || isa(In) || + (isa(In) || In->getNumOperands() == TE->getNumOperands()) && "Missed TreeEntry operands?"); (void)In; // fake use to avoid build failure when assertions disabled @@ -4489,7 +4489,7 @@ } else if (auto *I = dyn_cast(V)) { // Sort other instructions just by the opcodes except for CMPInst. // For CMP also sort by the predicate kind. - if ((isa(I) || isa(I)) && + if ((isa(I)) && isValidForAlternation(I->getOpcode())) { if (AllowAlternate) Key = hash_value(isa(I) ? 1 : 0); @@ -5536,8 +5536,7 @@ unsigned N = 1; Type *EltTy = T; - while (isa(EltTy) || isa(EltTy) || - isa(EltTy)) { + while (isa(EltTy)) { if (auto *ST = dyn_cast(EltTy)) { // Check that struct is homogeneous. for (const auto *Ty : ST->elements()) @@ -5867,9 +5866,9 @@ // Take credit for instruction that will become dead. if (EE->hasOneUse()) { Instruction *Ext = EE->user_back(); - if ((isa(Ext) || isa(Ext)) && - all_of(Ext->users(), - [](User *U) { return isa(U); })) { + if (isa(Ext) && all_of(Ext->users(), [](User *U) { + return isa(U); + })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. Cost -= @@ -6142,18 +6141,18 @@ // Take credit for instruction that will become dead. 
if (EI->hasOneUse()) { Instruction *Ext = EI->user_back(); - if ((isa(Ext) || isa(Ext)) && + if (isa(Ext) && all_of(Ext->users(), [](User *U) { return isa(U); })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - CommonCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, I); - // Add back the cost of s|zext which is subtracted separately. - CommonCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EI->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); - continue; + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + CommonCost -= TTI->getExtractWithExtendCost( + Ext->getOpcode(), Ext->getType(), VecTy, I); + // Add back the cost of s|zext which is subtracted separately. + CommonCost += TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EI->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; } } CommonCost -= @@ -9001,8 +9000,8 @@ for (Instruction &In : llvm::make_early_inc_range(*BB)) { if (isDeleted(&In)) continue; - if (!isa(&In) && !isa(&In) && - !isa(&In) && !GatherShuffleSeq.contains(&In)) + if (!isa(&In) && + !GatherShuffleSeq.contains(&In)) continue; // Check if we can replace this instruction with any of the @@ -9660,17 +9659,15 @@ // If the current instruction is a load, update MaxWidth to reflect the // width of the loaded value. - if (isa(I) || isa(I) || - isa(I)) + if (isa(I)) Width = std::max(Width, DL->getTypeSizeInBits(Ty)); // Otherwise, we need to visit the operands of the instruction. We only // handle the interesting cases from buildTree here. If an operand is an // instruction we haven't yet visited and from the same basic block as the // user or the use is a PHI node, we add it to the worklist. 
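Most of the changes in this file collapse chains of type tests into LLVM's variadic isa<>, which accepts several candidate types in one call. The idiom, for reference (illustrative sketch, not code from the patch):

  #include "llvm/IR/Instructions.h"
  static bool isLoadOrExtract(const llvm::Value *V) {
    // One variadic call instead of isa<LoadInst>(V) ||
    // isa<ExtractElementInst>(V) || isa<ExtractValueInst>(V).
    return llvm::isa<llvm::LoadInst, llvm::ExtractElementInst,
                     llvm::ExtractValueInst>(V);
  }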
- else if (isa(I) || isa(I) || isa(I) || - isa(I) || isa(I) || isa(I) || - isa(I)) { + else if (isa(I)) { for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) if (Visited.insert(J).second && @@ -9723,8 +9720,7 @@ break; case Instruction::ZExt: case Instruction::SExt: - if (isa(I->getOperand(0)) || - isa(I->getOperand(0))) + if (isa(I->getOperand(0))) return false; break; @@ -10083,7 +10079,7 @@ InstructionCost Cost = R.getTreeCost(); - LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); @@ -10384,6 +10380,7 @@ CandidateFound = true; MinCost = std::min(MinCost, Cost); + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", @@ -10422,8 +10419,7 @@ if (!I) return false; - if ((!isa(I) && !isa(I)) || - isa(I->getType())) + if (!isa(I) || isa(I->getType())) return false; Value *P = I->getParent(); @@ -11224,8 +11220,8 @@ InstructionCost ReductionCost = getReductionCost(TTI, VL, ReduxWidth, RdxFMF); InstructionCost Cost = TreeCost + ReductionCost; + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); if (!Cost.isValid()) { - LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n"); return nullptr; } if (Cost >= -SLPCostThreshold) { @@ -11533,8 +11529,7 @@ getInsertIndex(LastInsertInst, OperandOffset); if (!OperandIndex) return; - if (isa(InsertedOperand) || - isa(InsertedOperand)) { + if (isa(InsertedOperand)) { findBuildAggregate_rec(cast(InsertedOperand), TTI, BuildVectorOpds, InsertElts, *OperandIndex); @@ -11544,8 +11539,7 @@ } LastInsertInst = dyn_cast(LastInsertInst->getOperand(0)); } while (LastInsertInst != nullptr && - (isa(LastInsertInst) || - isa(LastInsertInst)) && + isa(LastInsertInst) && LastInsertInst->hasOneUse()); } @@ -12240,8 +12234,8 @@ // Ran into an instruction without users, like terminator, or function call // with ignored return value, store. Ignore unused instructions (basing on // instruction type, except for CallInst and InvokeInst). - if (it->use_empty() && (it->getType()->isVoidTy() || isa(it) || - isa(it))) { + if (it->use_empty() && + (it->getType()->isVoidTy() || isa(it))) { KeyNodes.insert(&*it); bool OpsChanged = false; if (ShouldStartVectorizeHorAtStore || !isa(it)) { @@ -12265,8 +12259,7 @@ } } - if (isa(it) || isa(it) || - isa(it)) + if (isa(it)) PostProcessInstructions.push_back(&*it); } diff --git a/llvm/test/CodeGen/AArch64/arm64-csel.ll b/llvm/test/CodeGen/AArch64/arm64-csel.ll --- a/llvm/test/CodeGen/AArch64/arm64-csel.ll +++ b/llvm/test/CodeGen/AArch64/arm64-csel.ll @@ -292,6 +292,32 @@ ret i64 %. } +; Regression test for FalseVal - TrueVal overflow +define i64 @foo18_overflow3(i1 %cmp) nounwind readnone optsize ssp { +; CHECK-LABEL: foo18_overflow3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: tst w0, #0x1 +; CHECK-NEXT: csel x0, x8, xzr, ne +; CHECK-NEXT: ret +entry: + %. = select i1 %cmp, i64 -9223372036854775808, i64 0 + ret i64 %. 
+} + +; Regression test for TrueVal - FalseVal overflow +define i64 @foo18_overflow4(i1 %cmp) nounwind readnone optsize ssp { +; CHECK-LABEL: foo18_overflow4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: tst w0, #0x1 +; CHECK-NEXT: csel x0, xzr, x8, ne +; CHECK-NEXT: ret +entry: + %. = select i1 %cmp, i64 0, i64 -9223372036854775808 + ret i64 %. +} + define i64 @foo19(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: foo19: ; CHECK: // %bb.0: // %entry diff --git a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll --- a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll +++ b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll @@ -54,9 +54,8 @@ define <2 x i32> @sext_extract_zext_idx0(<4 x i16> %vec) nounwind { ; CHECK-LABEL: sext_extract_zext_idx0: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %zext = zext <4 x i16> %vec to <4 x i32> %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0) @@ -65,6 +64,21 @@ ret <2 x i32> %sext_inreg } +; Negative test, combine should not fire if sign extension is for a different width. +define <2 x i32> @sext_extract_zext_idx0_negtest(<4 x i16> %vec) nounwind { +; CHECK-LABEL: sext_extract_zext_idx0_negtest: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: shl v0.2s, v0.2s, #17 +; CHECK-NEXT: sshr v0.2s, v0.2s, #17 +; CHECK-NEXT: ret + %zext = zext <4 x i16> %vec to <4 x i32> + %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0) + %sext_inreg_step0 = shl <2 x i32> %extract, <i32 17, i32 17> + %sext_inreg = ashr <2 x i32> %sext_inreg_step0, <i32 17, i32 17> + ret <2 x i32> %sext_inreg +} + define <4 x i16> @sext_extract_sext_idx0(<8 x i8> %vec) nounwind { ; CHECK-LABEL: sext_extract_sext_idx0: ; CHECK: // %bb.0: @@ -81,10 +95,9 @@ define <2 x i32> @sext_extract_zext_idx2(<4 x i16> %vec) nounwind { ; CHECK-LABEL: sext_extract_zext_idx2: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %zext = zext <4 x i16> %vec to <4 x i32> %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 2) diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll @@ -0,0 +1,337 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple aarch64-apple-darwin | FileCheck %s + +define <4 x double> @test_ldnp_v4f64(<4 x double>* %A) { +; CHECK-LABEL: test_ldnp_v4f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <4 x double>, <4 x double>* %A, align 8, !nontemporal !0 + ret <4 x double> %lv +} + +define <4 x i64> @test_ldnp_v4i64(<4 x i64>* %A) { +; CHECK-LABEL: test_ldnp_v4i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <4 x i64>, <4 x i64>* %A, align 8, !nontemporal !0 + ret <4 x i64> %lv +} + +define <8 x i32> @test_ldnp_v8i32(<8 x i32>* %A) { +; CHECK-LABEL: test_ldnp_v8i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; 
CHECK-NEXT: ret + %lv = load <8 x i32>, <8 x i32>* %A, align 8, !nontemporal !0 + ret <8 x i32> %lv +} + +define <8 x float> @test_ldnp_v8f32(<8 x float>* %A) { +; CHECK-LABEL: test_ldnp_v8f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <8 x float>, <8 x float>* %A, align 8, !nontemporal !0 + ret <8 x float> %lv +} + +define <16 x i16> @test_ldnp_v16i16(<16 x i16>* %A) { +; CHECK-LABEL: test_ldnp_v16i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <16 x i16>, <16 x i16>* %A, align 8, !nontemporal !0 + ret <16 x i16> %lv +} + +define <16 x half> @test_ldnp_v16f16(<16 x half>* %A) { +; CHECK-LABEL: test_ldnp_v16f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <16 x half>, <16 x half>* %A, align 8, !nontemporal !0 + ret <16 x half> %lv +} + +define <32 x i8> @test_ldnp_v32i8(<32 x i8>* %A) { +; CHECK-LABEL: test_ldnp_v32i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %lv = load <32 x i8>, <32 x i8>* %A, align 8, !nontemporal !0 + ret <32 x i8> %lv +} + +define <4 x i32> @test_ldnp_v4i32(<4 x i32>* %A) { +; CHECK-LABEL: test_ldnp_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %lv = load <4 x i32>, <4 x i32>* %A, align 8, !nontemporal !0 + ret <4 x i32> %lv +} + +define <4 x float> @test_ldnp_v4f32(<4 x float>* %A) { +; CHECK-LABEL: test_ldnp_v4f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %lv = load <4 x float>, <4 x float>* %A, align 8, !nontemporal !0 + ret <4 x float> %lv +} + +define <8 x i16> @test_ldnp_v8i16(<8 x i16>* %A) { +; CHECK-LABEL: test_ldnp_v8i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %lv = load <8 x i16>, <8 x i16>* %A, align 8, !nontemporal !0 + ret <8 x i16> %lv +} + +define <16 x i8> @test_ldnp_v16i8(<16 x i8>* %A) { +; CHECK-LABEL: test_ldnp_v16i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %lv = load <16 x i8>, <16 x i8>* %A, align 8, !nontemporal !0 + ret <16 x i8> %lv +} +define <2 x double> @test_ldnp_v2f64(<2 x double>* %A) { +; CHECK-LABEL: test_ldnp_v2f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %lv = load <2 x double>, <2 x double>* %A, align 8, !nontemporal !0 + ret <2 x double> %lv +} + +define <2 x i32> @test_ldnp_v2i32(<2 x i32>* %A) { +; CHECK-LABEL: test_ldnp_v2i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <2 x i32>, <2 x i32>* %A, align 8, !nontemporal !0 + ret <2 x i32> %lv +} + +define <2 x float> @test_ldnp_v2f32(<2 x float>* %A) { +; CHECK-LABEL: test_ldnp_v2f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <2 x float>, <2 x float>* %A, align 8, !nontemporal !0 + ret <2 x float> %lv +} + +define <4 x i16> @test_ldnp_v4i16(<4 x i16>* %A) { +; CHECK-LABEL: test_ldnp_v4i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <4 x i16>, <4 x i16>* %A, align 8, !nontemporal !0 + ret <4 x i16> %lv +} + +define <8 x i8> @test_ldnp_v8i8(<8 x i8>* %A) { +; CHECK-LABEL: test_ldnp_v8i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <8 x i8>, <8 x i8>* %A, align 8, !nontemporal !0 + ret <8 x i8> %lv +} + +define <1 x double> @test_ldnp_v1f64(<1 x double>* %A) { +; CHECK-LABEL: test_ldnp_v1f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <1 x double>, <1 x double>* %A, align 8, !nontemporal !0 + ret <1 x double> %lv +} +
+define <1 x i64> @test_ldnp_v1i64(<1 x i64>* %A) { +; CHECK-LABEL: test_ldnp_v1i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %lv = load <1 x i64>, <1 x i64>* %A, align 8, !nontemporal !0 + ret <1 x i64> %lv +} + +define <32 x i16> @test_ldnp_v32i16(<32 x i16>* %A) { +; CHECK-LABEL: test_ldnp_v32i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ret + %lv = load <32 x i16>, <32 x i16>* %A, align 8, !nontemporal !0 + ret <32 x i16> %lv +} + +define <32 x half> @test_ldnp_v32f16(<32 x half>* %A) { +; CHECK-LABEL: test_ldnp_v32f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ret + %lv = load <32 x half>, <32 x half>* %A, align 8, !nontemporal !0 + ret <32 x half> %lv +} + +define <16 x i32> @test_ldnp_v16i32(<16 x i32>* %A) { +; CHECK-LABEL: test_ldnp_v16i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ret + %lv = load <16 x i32>, <16 x i32>* %A, align 8, !nontemporal !0 + ret <16 x i32> %lv +} + +define <16 x float> @test_ldnp_v16f32(<16 x float>* %A) { +; CHECK-LABEL: test_ldnp_v16f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ret + %lv = load <16 x float>, <16 x float>* %A, align 8, !nontemporal !0 + ret <16 x float> %lv +} + +define <17 x float> @test_ldnp_v17f32(<17 x float>* %A) { +; CHECK-LABEL: test_ldnp_v17f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q1, q2, [x0, #32] +; CHECK-NEXT: ldp q3, q4, [x0] +; CHECK-NEXT: ldr s0, [x0, #64] +; CHECK-NEXT: stp q3, q4, [x8] +; CHECK-NEXT: stp q1, q2, [x8, #32] +; CHECK-NEXT: str s0, [x8, #64] +; CHECK-NEXT: ret + %lv = load <17 x float>, <17 x float>* %A, align 8, !nontemporal !0 + ret <17 x float> %lv +} + +define <33 x double> @test_ldnp_v33f64(<33 x double>* %A) { +; CHECK-LABEL: test_ldnp_v33f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ldp q4, q5, [x0, #64] +; CHECK-NEXT: ldp q6, q7, [x0, #96] +; CHECK-NEXT: ldp q16, q17, [x0, #128] +; CHECK-NEXT: ldp q18, q19, [x0, #160] +; CHECK-NEXT: ldp q21, q22, [x0, #224] +; CHECK-NEXT: ldp q23, q24, [x0, #192] +; CHECK-NEXT: ldr d20, [x0, #256] +; CHECK-NEXT: stp q0, q1, [x8] +; CHECK-NEXT: stp q2, q3, [x8, #32] +; CHECK-NEXT: stp q4, q5, [x8, #64] +; CHECK-NEXT: str d20, [x8, #256] +; CHECK-NEXT: stp q6, q7, [x8, #96] +; CHECK-NEXT: stp q16, q17, [x8, #128] +; CHECK-NEXT: stp q18, q19, [x8, #160] +; CHECK-NEXT: stp q23, q24, [x8, #192] +; CHECK-NEXT: stp q21, q22, [x8, #224] +; CHECK-NEXT: ret + %lv = load <33 x double>, <33 x double>* %A, align 8, !nontemporal !0 + ret <33 x double> %lv +} + +define <33 x i8> @test_ldnp_v33i8(<33 x i8>* %A) { +; CHECK-LABEL: test_ldnp_v33i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldrb w9, [x0, #32] +; CHECK-NEXT: stp q1, q0, [x8] +; CHECK-NEXT: strb w9, [x8, #32] +; CHECK-NEXT: ret + %lv = load <33 x i8>, <33 x i8>* %A, align 8, !nontemporal !0 + ret <33 x i8> %lv +} + +define <4 x i65> @test_ldnp_v4i65(<4 x i65>* %A) { +; CHECK-LABEL: test_ldnp_v4i65: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp x8, x9, [x0, #8] +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr x10, [x0, #24] +; CHECK-NEXT: and x1, x8, #0x1 +; CHECK-NEXT: ldrb w11, [x0, #32] +; CHECK-NEXT: extr x2, x9, x8, #1 +; CHECK-NEXT: extr x4, x10, x9, #2 +; CHECK-NEXT: extr x6, x11, x10, #3 +; CHECK-NEXT: ubfx x3, x9, #1, #1 +; CHECK-NEXT: mov.d v0[1], x1 +; CHECK-NEXT: 
ubfx x5, x10, #2, #1 +; CHECK-NEXT: ubfx x7, x11, #3, #1 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %lv = load <4 x i65>, <4 x i65>* %A, align 8, !nontemporal !0 + ret <4 x i65> %lv +} + +define <4 x i63> @test_ldnp_v4i63(<4 x i63>* %A) { +; CHECK-LABEL: test_ldnp_v4i63: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp x8, x9, [x0] +; CHECK-NEXT: ldp x10, x11, [x0, #16] +; CHECK-NEXT: extr x12, x9, x8, #63 +; CHECK-NEXT: and x0, x8, #0x7fffffffffffffff +; CHECK-NEXT: extr x9, x10, x9, #62 +; CHECK-NEXT: extr x10, x11, x10, #61 +; CHECK-NEXT: and x1, x12, #0x7fffffffffffffff +; CHECK-NEXT: and x2, x9, #0x7fffffffffffffff +; CHECK-NEXT: and x3, x10, #0x7fffffffffffffff +; CHECK-NEXT: ret + %lv = load <4 x i63>, <4 x i63>* %A, align 8, !nontemporal !0 + ret <4 x i63> %lv +} + +define <5 x double> @test_ldnp_v5f64(<5 x double>* %A) { +; CHECK-LABEL: test_ldnp_v5f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q1 +; CHECK-NEXT: ext.16b v3, v2, v2, #8 +; CHECK-NEXT: ldr d4, [x0, #32] +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q2 +; CHECK-NEXT: ; kill: def $d3 killed $d3 killed $q3 +; CHECK-NEXT: ; kill: def $d4 killed $d4 killed $q4 +; CHECK-NEXT: ret + %lv = load <5 x double>, <5 x double>* %A, align 8, !nontemporal !0 + ret <5 x double> %lv +} + +define <16 x i64> @test_ldnp_v16i64(<16 x i64>* %A) { +; CHECK-LABEL: test_ldnp_v16i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ldp q4, q5, [x0, #64] +; CHECK-NEXT: ldp q6, q7, [x0, #96] +; CHECK-NEXT: ret + %lv = load <16 x i64>, <16 x i64>* %A, align 8, !nontemporal !0 + ret <16 x i64> %lv +} + +define <16 x double> @test_ldnp_v16f64(<16 x double>* %A) { +; CHECK-LABEL: test_ldnp_v16f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q3, [x0, #32] +; CHECK-NEXT: ldp q4, q5, [x0, #64] +; CHECK-NEXT: ldp q6, q7, [x0, #96] +; CHECK-NEXT: ret + %lv = load <16 x double>, <16 x double>* %A, align 8, !nontemporal !0 + ret <16 x double> %lv +} + + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -575,9 +575,7 @@ ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v1.4h, v1.4h, #0.0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov w9, v1.s[1] ; CHECK-NEXT: ldr q1, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -17,15 +17,11 @@ ; CHECK-NEXT: ldr s2, [x1] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: fcmeq v1.4h, v1.4h, v2.4h -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov w9, v1.s[1] ; CHECK-NEXT: mov v0.h[0], w8 ; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: shl v0.4h, v0.4h, #15 -; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 ; CHECK-NEXT: cmpne p0.h, p0/z, 
z0.h, #0 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -538,9 +538,7 @@ ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0 ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: shl v2.2s, v2.2s, #16 -; CHECK-NEXT: sshr v2.2s, v2.2s, #16 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: mov w9, v2.s[1] ; CHECK-NEXT: ldr q2, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -17,9 +17,7 @@ ; CHECK-NEXT: ldr s2, [x1] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: fcmeq v2.4h, v1.4h, v2.4h -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: shl v2.2s, v2.2s, #16 -; CHECK-NEXT: sshr v2.2s, v2.2s, #16 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: mov w9, v2.s[1] ; CHECK-NEXT: mov v0.h[0], w8 diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll --- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll @@ -326,26 +326,16 @@ define <8 x double> @sitofp_i16_double(<8 x i16> %a) { ; CHECK-LABEL: sitofp_i16_double: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: shl v2.2s, v1.2s, #16 -; CHECK-NEXT: shl v3.2s, v0.2s, #16 -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: sshr v2.2s, v2.2s, #16 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: sshr v3.2s, v3.2s, #16 -; CHECK-NEXT: sshll v2.2d, v2.2s, #0 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-NEXT: scvtf v2.2d, v2.2d -; CHECK-NEXT: sshr v1.2s, v1.2s, #16 -; CHECK-NEXT: sshr v0.2s, v0.2s, #16 -; CHECK-NEXT: sshll v3.2d, v3.2s, #0 +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll2 v2.2d, v1.4s, #0 +; CHECK-NEXT: sshll2 v3.2d, v0.4s, #0 ; CHECK-NEXT: sshll v4.2d, v1.2s, #0 -; CHECK-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-NEXT: scvtf v0.2d, v3.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: scvtf v3.2d, v4.2d +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: scvtf v1.2d, v3.2d +; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: scvtf v3.2d, v2.2d +; CHECK-NEXT: scvtf v2.2d, v4.2d ; CHECK-NEXT: ret %1 = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -8,50 +8,51 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: s_or_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s4 -; CHECK-NEXT: v_mov_b32_e32 v15, v1 -; CHECK-NEXT: v_mov_b32_e32 v14, v2 -; CHECK-NEXT: 
v_mov_b32_e32 v13, v3 -; CHECK-NEXT: v_mov_b32_e32 v12, v4 -; CHECK-NEXT: v_mov_b32_e32 v11, v5 -; CHECK-NEXT: v_mov_b32_e32 v10, v6 -; CHECK-NEXT: v_mov_b32_e32 v9, v7 +; CHECK-NEXT: v_mov_b32_e32 v14, v1 +; CHECK-NEXT: v_mov_b32_e32 v13, v2 +; CHECK-NEXT: v_mov_b32_e32 v12, v3 +; CHECK-NEXT: v_mov_b32_e32 v11, v4 +; CHECK-NEXT: v_mov_b32_e32 v10, v5 +; CHECK-NEXT: v_mov_b32_e32 v9, v6 +; CHECK-NEXT: v_mov_b32_e32 v8, v7 ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec -; CHECK-NEXT: v_mov_b32_e32 v1, v15 -; CHECK-NEXT: v_mov_b32_e32 v2, v14 -; CHECK-NEXT: v_mov_b32_e32 v3, v13 -; CHECK-NEXT: v_mov_b32_e32 v4, v12 -; CHECK-NEXT: v_mov_b32_e32 v5, v11 -; CHECK-NEXT: v_mov_b32_e32 v6, v10 -; CHECK-NEXT: v_mov_b32_e32 v7, v9 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v1, v14 +; CHECK-NEXT: v_mov_b32_e32 v2, v13 +; CHECK-NEXT: v_mov_b32_e32 v3, v12 +; CHECK-NEXT: v_mov_b32_e32 v4, v11 +; CHECK-NEXT: v_mov_b32_e32 v5, v10 +; CHECK-NEXT: v_mov_b32_e32 v6, v9 +; CHECK-NEXT: v_mov_b32_e32 v7, v8 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s4, s8 ; CHECK-NEXT: s_mov_b32 s5, s8 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_writelane_b32 v8, s4, 0 -; CHECK-NEXT: v_writelane_b32 v8, s5, 1 -; CHECK-NEXT: v_writelane_b32 v8, s6, 2 -; CHECK-NEXT: v_writelane_b32 v8, s7, 3 +; CHECK-NEXT: v_writelane_b32 v16, s4, 0 +; CHECK-NEXT: v_writelane_b32 v16, s5, 1 +; CHECK-NEXT: v_writelane_b32 v16, s6, 2 +; CHECK-NEXT: v_writelane_b32 v16, s7, 3 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_writelane_b32 v8, s4, 4 +; CHECK-NEXT: v_writelane_b32 v16, s4, 4 ; 
CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -59,16 +60,15 @@ ; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v7, v9 -; CHECK-NEXT: v_mov_b32_e32 v6, v10 -; CHECK-NEXT: v_mov_b32_e32 v5, v11 -; CHECK-NEXT: v_mov_b32_e32 v4, v12 -; CHECK-NEXT: v_mov_b32_e32 v3, v13 -; CHECK-NEXT: v_mov_b32_e32 v2, v14 -; CHECK-NEXT: v_mov_b32_e32 v1, v15 -; CHECK-NEXT: v_mov_b32_e32 v0, v16 +; CHECK-NEXT: v_mov_b32_e32 v7, v8 +; CHECK-NEXT: v_mov_b32_e32 v6, v9 +; CHECK-NEXT: v_mov_b32_e32 v5, v10 +; CHECK-NEXT: v_mov_b32_e32 v4, v11 +; CHECK-NEXT: v_mov_b32_e32 v3, v12 +; CHECK-NEXT: v_mov_b32_e32 v2, v13 +; CHECK-NEXT: v_mov_b32_e32 v1, v14 +; CHECK-NEXT: v_mov_b32_e32 v0, v15 ; CHECK-NEXT: v_readfirstlane_b32 s12, v7 ; CHECK-NEXT: v_readfirstlane_b32 s10, v6 ; CHECK-NEXT: v_readfirstlane_b32 s9, v5 @@ -85,22 +85,22 @@ ; CHECK-NEXT: s_mov_b32 s17, s6 ; CHECK-NEXT: s_mov_b32 s18, s5 ; CHECK-NEXT: s_mov_b32 s19, s4 -; CHECK-NEXT: v_writelane_b32 v8, s12, 5 -; CHECK-NEXT: v_writelane_b32 v8, s13, 6 -; CHECK-NEXT: v_writelane_b32 v8, s14, 7 -; CHECK-NEXT: v_writelane_b32 v8, s15, 8 -; CHECK-NEXT: v_writelane_b32 v8, s16, 9 -; CHECK-NEXT: v_writelane_b32 v8, s17, 10 -; CHECK-NEXT: v_writelane_b32 v8, s18, 11 -; CHECK-NEXT: v_writelane_b32 v8, s19, 12 -; CHECK-NEXT: v_mov_b32_e32 v6, v9 -; CHECK-NEXT: v_mov_b32_e32 v7, v10 -; CHECK-NEXT: v_mov_b32_e32 v4, v11 -; CHECK-NEXT: v_mov_b32_e32 v5, v12 -; CHECK-NEXT: v_mov_b32_e32 v2, v13 -; CHECK-NEXT: v_mov_b32_e32 v3, v14 -; CHECK-NEXT: v_mov_b32_e32 v0, v15 -; CHECK-NEXT: v_mov_b32_e32 v1, v16 +; CHECK-NEXT: v_writelane_b32 v16, s12, 5 +; CHECK-NEXT: v_writelane_b32 v16, s13, 6 +; CHECK-NEXT: v_writelane_b32 v16, s14, 7 +; CHECK-NEXT: v_writelane_b32 v16, s15, 8 +; CHECK-NEXT: v_writelane_b32 v16, s16, 9 +; CHECK-NEXT: v_writelane_b32 v16, s17, 10 +; CHECK-NEXT: v_writelane_b32 v16, s18, 11 +; CHECK-NEXT: v_writelane_b32 v16, s19, 12 +; CHECK-NEXT: v_mov_b32_e32 v6, v8 +; CHECK-NEXT: v_mov_b32_e32 v7, v9 +; CHECK-NEXT: v_mov_b32_e32 v4, v10 +; CHECK-NEXT: v_mov_b32_e32 v5, v11 +; CHECK-NEXT: v_mov_b32_e32 v2, v12 +; CHECK-NEXT: v_mov_b32_e32 v3, v13 +; CHECK-NEXT: v_mov_b32_e32 v0, v14 +; CHECK-NEXT: v_mov_b32_e32 v1, v15 ; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13] ; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15] ; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17] @@ -113,40 +113,40 @@ ; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1] ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 -; CHECK-NEXT: v_writelane_b32 v8, s4, 13 +; CHECK-NEXT: v_writelane_b32 v16, s4, 13 ; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s4, v8, 13 -; CHECK-NEXT: 
v_readlane_b32 s8, v8, 5 -; CHECK-NEXT: v_readlane_b32 s9, v8, 6 -; CHECK-NEXT: v_readlane_b32 s10, v8, 7 -; CHECK-NEXT: v_readlane_b32 s11, v8, 8 -; CHECK-NEXT: v_readlane_b32 s12, v8, 9 -; CHECK-NEXT: v_readlane_b32 s13, v8, 10 -; CHECK-NEXT: v_readlane_b32 s14, v8, 11 -; CHECK-NEXT: v_readlane_b32 s15, v8, 12 -; CHECK-NEXT: v_readlane_b32 s16, v8, 0 -; CHECK-NEXT: v_readlane_b32 s17, v8, 1 -; CHECK-NEXT: v_readlane_b32 s18, v8, 2 -; CHECK-NEXT: v_readlane_b32 s19, v8, 3 +; CHECK-NEXT: v_readlane_b32 s4, v16, 13 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s8, v16, 5 +; CHECK-NEXT: v_readlane_b32 s9, v16, 6 +; CHECK-NEXT: v_readlane_b32 s10, v16, 7 +; CHECK-NEXT: v_readlane_b32 s11, v16, 8 +; CHECK-NEXT: v_readlane_b32 s12, v16, 9 +; CHECK-NEXT: v_readlane_b32 s13, v16, 10 +; CHECK-NEXT: v_readlane_b32 s14, v16, 11 +; CHECK-NEXT: v_readlane_b32 s15, v16, 12 +; CHECK-NEXT: v_readlane_b32 s16, v16, 0 +; CHECK-NEXT: v_readlane_b32 s17, v16, 1 +; CHECK-NEXT: v_readlane_b32 s18, v16, 2 +; CHECK-NEXT: v_readlane_b32 s19, v16, 3 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_readlane_b32 s4, v8, 4 +; CHECK-NEXT: v_readlane_b32 s4, v16, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: s_or_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -32,39 +32,39 @@ ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s2, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s2, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s2, 0 ; GCN_DBG-NEXT: s_load_dword s1, s[0:1], 0xa ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s2, -1 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB0_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, 
s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock ; GCN_DBG-NEXT: s_endpgm @@ -107,35 +107,35 @@ ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_branch .LBB1_2 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB1_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1 ; GCN_DBG-NEXT: s_branch .LBB1_2 entry: @@ -172,35 +172,35 @@ ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_branch .LBB2_2 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB2_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; 
GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1 ; GCN_DBG-NEXT: s_branch .LBB2_2 entry: @@ -238,33 +238,33 @@ ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_branch .LBB3_2 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB3_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 s2, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s2, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN_DBG-NEXT: s_branch .LBB3_2 entry: @@ -316,48 +316,48 @@ ; GCN_DBG: ; %bb.0: ; %entry ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, 0 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: ds_read_u8 v1, v1 +; GCN_DBG-NEXT: ds_read_u8 v0, v0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v1 +; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v0 ; GCN_DBG-NEXT: s_and_b32 s0, 1, s0 ; GCN_DBG-NEXT: s_cmp_eq_u32 s0, 1 ; GCN_DBG-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: v_writelane_b32 v0, s1, 2 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s1, 2 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3 ; GCN_DBG-NEXT: s_branch .LBB4_2 ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB4_2: ; 
%for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3 -; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 1 -; GCN_DBG-NEXT: v_readlane_b32 s3, v0, 2 -; GCN_DBG-NEXT: v_readlane_b32 s4, v0, 0 +; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 3 +; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 1 +; GCN_DBG-NEXT: v_readlane_b32 s3, v2, 2 +; GCN_DBG-NEXT: v_readlane_b32 s4, v2, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 ; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 ; GCN_DBG-NEXT: s_mov_b32 s4, 0x80 ; GCN_DBG-NEXT: s_add_i32 s1, s1, s4 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_read_b32 v1, v1 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_read_b32 v0, v0 ; GCN_DBG-NEXT: s_mov_b32 s4, 1.0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) -; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s4 +; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s4 ; GCN_DBG-NEXT: s_mov_b32 m0, -1 -; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1 -; GCN_DBG-NEXT: ds_write_b32 v1, v2 +; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1 +; GCN_DBG-NEXT: ds_write_b32 v0, v1 ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 +; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3 ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1 ; GCN_DBG-NEXT: s_branch .LBB4_2 entry: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -420,11 +420,11 @@ ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0: [[INNER_LOOP:.LBB[0-9]+_[0-9]+]]: -; GCN-O0: buffer_load_dword ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] ; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] +; GCN-O0: buffer_load_dword ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -48,9 +48,6 @@ ; VMEM: [[ENDIF]]: -; Restore val -; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload - ; Reload and restore exec mask ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -62,6 +59,9 @@ ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] +; Restore val +; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload + ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]] define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 { entry: @@ -121,7 +121,6 @@ ; GCN: 
buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: [[END]]: -; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -131,6 +130,7 @@ ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] +; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 { @@ -187,7 +187,6 @@ ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] ; GCN: [[FLOW]]: ; %Flow -; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -199,6 +198,7 @@ ; GCN: s_or_saveexec_b64 s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]], s[[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]] ; Regular spill value restored after exec modification +; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; Followed by spill ; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill @@ -230,7 +230,6 @@ ; GCN-NEXT: s_branch [[FLOW]] ; GCN: [[ENDIF]]: -; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]] @@ -242,6 +241,7 @@ ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] +; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -13,7 +13,7 @@ ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0 + ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset.cast, align 4, addrspace 4) ; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1 ; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1 @@ -23,7 +23,7 @@ ; GCN-NEXT: renamable $sgpr1 = COPY killed renamable $sgpr6 ; GCN-NEXT: renamable $sgpr2 = COPY killed renamable $sgpr5 ; GCN-NEXT: renamable $sgpr3 = COPY 
killed renamable $sgpr4 - ; GCN-NEXT: SI_SPILL_S128_SAVE killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.1, align 4, addrspace 5) + ; GCN-NEXT: SI_SPILL_S128_SAVE killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.2, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr0 = S_MOV_B32 16 ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 15 ; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 14 @@ -40,55 +40,59 @@ ; GCN-NEXT: renamable $sgpr13 = S_MOV_B32 2 ; GCN-NEXT: renamable $sgpr14 = S_MOV_B32 1 ; GCN-NEXT: renamable $sgpr15 = S_MOV_B32 0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr15 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr14 - ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr13 - ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr12 - ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr11 - ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr10 - ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr9 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr8 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr7 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr6 - ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr5 - ; GCN-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr4 - ; GCN-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr3 - ; GCN-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr2 - ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr1 - ; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr0 - ; GCN-NEXT: undef %28.sub0:vreg_512 = COPY [[COPY1]] - ; GCN-NEXT: %28.sub1:vreg_512 = COPY [[COPY2]] - ; GCN-NEXT: %28.sub2:vreg_512 = COPY [[COPY3]] - ; GCN-NEXT: %28.sub3:vreg_512 = COPY [[COPY4]] - ; GCN-NEXT: %28.sub4:vreg_512 = COPY [[COPY5]] - ; GCN-NEXT: %28.sub5:vreg_512 = COPY [[COPY6]] - ; GCN-NEXT: %28.sub6:vreg_512 = COPY [[COPY7]] - ; GCN-NEXT: %28.sub7:vreg_512 = COPY [[COPY8]] - ; GCN-NEXT: %28.sub8:vreg_512 = COPY [[COPY9]] - ; GCN-NEXT: %28.sub9:vreg_512 = COPY [[COPY10]] - ; GCN-NEXT: %28.sub10:vreg_512 = COPY [[COPY11]] - ; GCN-NEXT: %28.sub11:vreg_512 = COPY [[COPY12]] - ; GCN-NEXT: %28.sub12:vreg_512 = COPY [[COPY13]] - ; GCN-NEXT: %28.sub13:vreg_512 = COPY [[COPY14]] - ; GCN-NEXT: %28.sub14:vreg_512 = COPY [[COPY15]] - ; GCN-NEXT: %28.sub15:vreg_512 = COPY [[COPY16]] + ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr15 + ; GCN-NEXT: renamable $vgpr30 = COPY killed renamable $sgpr14 + ; GCN-NEXT: renamable $vgpr29 = COPY killed renamable $sgpr13 + ; GCN-NEXT: renamable $vgpr28 = COPY killed renamable $sgpr12 + ; GCN-NEXT: renamable $vgpr27 = COPY killed renamable $sgpr11 + ; GCN-NEXT: renamable $vgpr26 = COPY killed renamable $sgpr10 + ; GCN-NEXT: renamable $vgpr25 = COPY killed renamable $sgpr9 + ; GCN-NEXT: renamable $vgpr24 = COPY killed renamable $sgpr8 + ; GCN-NEXT: renamable $vgpr23 = COPY killed renamable $sgpr7 + ; GCN-NEXT: renamable $vgpr22 = COPY killed renamable $sgpr6 + ; GCN-NEXT: renamable $vgpr21 = COPY killed renamable $sgpr5 + ; GCN-NEXT: renamable $vgpr20 = COPY killed renamable $sgpr4 + ; GCN-NEXT: renamable $vgpr19 = COPY killed renamable $sgpr3 + ; GCN-NEXT: renamable $vgpr18 = COPY killed renamable $sgpr2 + ; GCN-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr1 + ; GCN-NEXT: 
renamable $vgpr16 = COPY killed renamable $sgpr0 + ; GCN-NEXT: undef renamable $vgpr0 = COPY killed renamable $vgpr0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: renamable $vgpr1 = COPY killed renamable $vgpr30 + ; GCN-NEXT: renamable $vgpr2 = COPY killed renamable $vgpr29 + ; GCN-NEXT: renamable $vgpr3 = COPY killed renamable $vgpr28 + ; GCN-NEXT: renamable $vgpr4 = COPY killed renamable $vgpr27 + ; GCN-NEXT: renamable $vgpr5 = COPY killed renamable $vgpr26 + ; GCN-NEXT: renamable $vgpr6 = COPY killed renamable $vgpr25 + ; GCN-NEXT: renamable $vgpr7 = COPY killed renamable $vgpr24 + ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr23 + ; GCN-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr22 + ; GCN-NEXT: renamable $vgpr10 = COPY killed renamable $vgpr21 + ; GCN-NEXT: renamable $vgpr11 = COPY killed renamable $vgpr20 + ; GCN-NEXT: renamable $vgpr12 = COPY killed renamable $vgpr19 + ; GCN-NEXT: renamable $vgpr13 = COPY killed renamable $vgpr18 + ; GCN-NEXT: renamable $vgpr14 = COPY killed renamable $vgpr17 + ; GCN-NEXT: renamable $vgpr15 = COPY killed renamable $vgpr16 + ; GCN-NEXT: SI_SPILL_V512_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64 $exec ; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr0_sgpr1 = IMPLICIT_DEF ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) - ; GCN-NEXT: dead %45:vgpr_32 = COPY [[DEF]] - ; GCN-NEXT: renamable $sgpr2 = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec - ; GCN-NEXT: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, [[COPY]](s32), implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5) + ; GCN-NEXT: $vgpr17 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) + ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = SI_SPILL_V512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5) + ; GCN-NEXT: $vgpr16 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; GCN-NEXT: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr16, implicit $exec + ; GCN-NEXT: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, $vgpr16, implicit $exec ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V16 %28, killed $sgpr2, 11, implicit-def $m0, implicit $m0, implicit $exec - ; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_]] + ; GCN-NEXT: renamable $vgpr0 = V_INDIRECT_REG_READ_GPR_IDX_B32_V16 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $sgpr2, 11, implicit-def $m0, implicit $m0, implicit $exec + ; GCN-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY renamable $sgpr0_sgpr1 - ; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5) + ; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) ; GCN-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; GCN-NEXT: {{ $}} @@ -99,8 +103,9 @@ ; GCN-NEXT: $exec = S_MOV_B64 renamable $sgpr0_sgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_]], killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1) + ; GCN-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) + ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -227,14 +227,14 @@ ; W64-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s{{[0-9]+}} ; W64-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], s{{[0-9]+}} -; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill ; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec +; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill ; W64-O0: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; 
W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] @@ -251,7 +251,7 @@ ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill @@ -270,10 +270,10 @@ ; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] ; W64-O0: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] @@ -297,10 +297,10 @@ ; W64-O0: s_xor_b64 exec, exec, [[SAVE]] ; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]] -; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload ; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]] ; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]] ; W64-O0: s_mov_b64 exec, s[[[SAVEEXEC0]]:[[SAVEEXEC1]]] +; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Spill ; W64-O0: [[TERMBB]]: diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -19,10 +19,10 @@ ; CHECK-NEXT: v_writelane_b32 v40, s33, 2 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: .Ltmp0: ; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, _ZL13sleep_foreverv@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, _ZL13sleep_foreverv@gotpcrel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll 
b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -191,23 +191,23 @@ ; we have no VGPR to allocate for SGPR spills. We are forced to spill to memory. ; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr: -; GCN: v_writelane_b32 v{{[0-9]+}}, s34, 0 -; GCN: v_writelane_b32 v{{[0-9]+}}, s35, 1 -; GCN: v_writelane_b32 v{{[0-9]+}}, s36, 2 -; GCN: v_writelane_b32 v{{[0-9]+}}, s37, 3 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 +; GCN: v_writelane_b32 [[A:v[0-9]+]], s34, 0 +; GCN: buffer_store_dword [[A]], off, s[0:3], s32 +; GCN: v_writelane_b32 [[B:v[0-9]+]], s35, 0 +; GCN: buffer_store_dword [[B]], off, s[0:3], s32 +; GCN: v_writelane_b32 [[C:v[0-9]+]], s36, 0 +; GCN: buffer_store_dword [[C]], off, s[0:3], s32 +; GCN: v_writelane_b32 [[D:v[0-9]+]], s37, 0 +; GCN: buffer_store_dword [[D]], off, s[0:3], s32 ; GCN: #ASMEND -; GCN: buffer_load_dword v{{[0-9]+}} -; GCN: buffer_load_dword v{{[0-9]+}} -; GCN: buffer_load_dword v{{[0-9]+}} -; GCN: buffer_load_dword v{{[0-9]+}} -; GCN: v_readlane_b32 s37, v{{[0-9]+}}, 3 -; GCN: v_readlane_b32 s36, v{{[0-9]+}}, 2 -; GCN: v_readlane_b32 s35, v{{[0-9]+}}, 1 -; GCN: v_readlane_b32 s34, v{{[0-9]+}}, 0 +; GCN: buffer_load_dword [[E:v[0-9]+]] +; GCN: v_readlane_b32 s37, [[E]], 0 +; GCN: buffer_load_dword [[F:v[0-9]+]] +; GCN: v_readlane_b32 s36, [[F]], 0 +; GCN: buffer_load_dword [[G:v[0-9]+]] +; GCN: v_readlane_b32 s35, [[G]], 0 +; GCN: buffer_load_dword [[H:v[0-9]+]] +; GCN: v_readlane_b32 s34, [[H]], 0 define void @spill_sgpr_no_free_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %a = load <4 x i32>, <4 x i32> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -133,7 +133,7 @@ ; GFX9-O0: ; %bb.0: ; %entry ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -144,18 +144,18 @@ ; GFX9-O0-NEXT: s_mov_b32 s39, s7 ; GFX9-O0-NEXT: s_mov_b64 s[42:43], s[38:39] ; GFX9-O0-NEXT: s_mov_b64 s[40:41], s[36:37] -; GFX9-O0-NEXT: v_writelane_b32 v3, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v3, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v3, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v3, s43, 3 +; GFX9-O0-NEXT: v_writelane_b32 v5, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v5, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v5, s42, 2 +; GFX9-O0-NEXT: v_writelane_b32 v5, s43, 3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 +; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -165,23 +165,23 @@ ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, s34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec -; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 5 +; GFX9-O0-NEXT: v_writelane_b32 v5, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v5, s35, 5 ; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -194,20 +194,19 @@ ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 5 +; GFX9-O0-NEXT: v_readlane_b32 s34, v5, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v5, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 2 -; GFX9-O0-NEXT: v_readlane_b32 s39, v3, 3 +; GFX9-O0-NEXT: v_readlane_b32 s36, v5, 0 +; GFX9-O0-NEXT: v_readlane_b32 s37, v5, 1 +; GFX9-O0-NEXT: v_readlane_b32 s38, v5, 2 +; GFX9-O0-NEXT: v_readlane_b32 s39, v5, 3 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v3 ; GFX9-O0-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[34:35] ; GFX9-O0-NEXT: s_mov_b32 s34, 1 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 @@ -216,7 +215,7 @@ ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] diff --git a/llvm/test/CodeGen/ARM/vector-store.ll b/llvm/test/CodeGen/ARM/vector-store.ll --- a/llvm/test/CodeGen/ARM/vector-store.ll +++ b/llvm/test/CodeGen/ARM/vector-store.ll @@ -419,3 +419,20 @@ store <3 x i8> zeroinitializer, <3 x i8> *%p, align 4 ret void } + +define void @v3i64shuffle(<3 x i64> *%p, <3 x i64> %a) { +; CHECK-LABEL: v3i64shuffle: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 q8, #0x0 +; CHECK-NEXT: ldrd r12, r1, [sp, #8] +; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vorr d19, d16, d16 +; CHECK-NEXT: str r1, [r0, #20] +; CHECK-NEXT: vst1.32 {d18, d19}, [r0]! +; CHECK-NEXT: str.w r12, [r0] +; CHECK-NEXT: bx lr + %b = shufflevector <3 x i64> %a, <3 x i64> zeroinitializer, <3 x i32> + store <3 x i64> %b, <3 x i64> *%p, align 4 + ret void +} + diff --git a/llvm/test/CodeGen/AVR/interrupts.ll b/llvm/test/CodeGen/AVR/interrupts.ll --- a/llvm/test/CodeGen/AVR/interrupts.ll +++ b/llvm/test/CodeGen/AVR/interrupts.ll @@ -1,18 +1,16 @@ ; RUN: llc < %s -march=avr | FileCheck %s @count = global i8 0 +@funcptr = global void () addrspace(1)* null define avr_intrcc void @interrupt_handler() { ; CHECK-LABEL: interrupt_handler: ; CHECK: sei ; CHECK-NEXT: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; CHECK-NEXT: clr r1 ; CHECK: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti ret void @@ -22,13 +20,10 @@ ; CHECK-LABEL: interrupt_handler_via_ir_attribute: ; CHECK: sei ; CHECK-NEXT: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; CHECK-NEXT: clr r1 ; CHECK: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti ret void @@ -38,13 +33,10 @@ ; CHECK-LABEL: signal_handler: ; CHECK-NOT: sei ; CHECK: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; CHECK-NEXT: clr r1 ; CHECK: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti ret void @@ -54,13 +46,10 @@ ; CHECK-LABEL: signal_handler_via_attribute: ; CHECK-NOT: sei ; CHECK: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; CHECK-NEXT: clr r1 ; CHECK: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti ret void @@ -70,10 +59,8 @@ ; CHECK-LABEL: interrupt_alloca: ; CHECK: sei ; CHECK-NEXT: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; CHECK-NEXT: clr r1 ; CHECK: push r28 ; CHECK-NEXT: push r29 ; CHECK-NEXT: in r28, 61 @@ -94,7 +81,6 @@ ; CHECK-NEXT: pop r28 ; CHECK: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti alloca i8 @@ -104,10 +90,8 @@ define void @signal_handler_with_increment() #1 { ; CHECK-LABEL: signal_handler_with_increment: ; CHECK: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 -; 
CHECK-NEXT: clr r1 ; CHECK-NEXT: push r24 ; CHECK-NEXT: lds r24, count ; CHECK-NEXT: inc r24 @@ -115,7 +99,6 @@ ; CHECK-NEXT: pop r24 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti %old = load volatile i8, i8* @count @@ -124,6 +107,29 @@ ret void } +; Check that r1 is saved/restored and set to 0 when using inline assembly. +define void @signal_handler_with_asm() #1 { +; CHECK-LABEL: signal_handler_with_asm: +; CHECK: push r0 +; CHECK-NEXT: in r0, 63 +; CHECK-NEXT: push r0 +; CHECK-NEXT: push r1 +; CHECK-NEXT: clr r1 +; CHECK-NEXT: push r24 +; CHECK-NEXT: ldi +; ;APP +; CHECK: mov +; ;NO_APP +; CHECK: pop r24 +; CHECK-NEXT: pop r1 +; CHECK-NEXT: pop r0 +; CHECK-NEXT: out 63, r0 +; CHECK-NEXT: pop r0 +; CHECK-NEXT: reti + call i8 asm sideeffect "mov $0, $1", "=r,r"(i8 3) nounwind + ret void +} + declare void @foo() ; When a signal handler calls a function, it must push/pop all call clobbered @@ -131,9 +137,9 @@ define void @signal_handler_with_call() #1 { ; CHECK-LABEL: signal_handler_with_call: ; CHECK: push r0 -; CHECK-NEXT: push r1 ; CHECK-NEXT: in r0, 63 ; CHECK-NEXT: push r0 +; CHECK-NEXT: push r1 ; CHECK-NEXT: clr r1 ; CHECK-NEXT: push r18 ; CHECK-NEXT: push r19 @@ -160,14 +166,58 @@ ; CHECK-NEXT: pop r20 ; CHECK-NEXT: pop r19 ; CHECK-NEXT: pop r18 +; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: out 63, r0 -; CHECK-NEXT: pop r1 ; CHECK-NEXT: pop r0 ; CHECK-NEXT: reti call void @foo() ret void } +define void @signal_handler_with_icall() #1 { +; CHECK-LABEL: signal_handler_with_icall: +; CHECK: push r0 +; CHECK-NEXT: in r0, 63 +; CHECK-NEXT: push r0 +; CHECK-NEXT: push r1 +; CHECK-NEXT: clr r1 +; CHECK-NEXT: push r18 +; CHECK-NEXT: push r19 +; CHECK-NEXT: push r20 +; CHECK-NEXT: push r21 +; CHECK-NEXT: push r22 +; CHECK-NEXT: push r23 +; CHECK-NEXT: push r24 +; CHECK-NEXT: push r25 +; CHECK-NEXT: push r26 +; CHECK-NEXT: push r27 +; CHECK-NEXT: push r30 +; CHECK-NEXT: push r31 +; CHECK-NEXT: lds r30, funcptr +; CHECK-NEXT: lds r31, funcptr+1 +; CHECK-NEXT: icall +; CHECK-NEXT: pop r31 +; CHECK-NEXT: pop r30 +; CHECK-NEXT: pop r27 +; CHECK-NEXT: pop r26 +; CHECK-NEXT: pop r25 +; CHECK-NEXT: pop r24 +; CHECK-NEXT: pop r23 +; CHECK-NEXT: pop r22 +; CHECK-NEXT: pop r21 +; CHECK-NEXT: pop r20 +; CHECK-NEXT: pop r19 +; CHECK-NEXT: pop r18 +; CHECK-NEXT: pop r1 +; CHECK-NEXT: pop r0 +; CHECK-NEXT: out 63, r0 +; CHECK-NEXT: pop r0 +; CHECK-NEXT: reti + %ptr = load volatile void() addrspace(1)*, void() addrspace(1)** @funcptr + call void %ptr() + ret void +} + attributes #0 = { "interrupt" } attributes #1 = { "signal" } diff --git a/llvm/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir b/llvm/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AVR/pseudo/LDWRdPtr-same-src-dst.mir +++ /dev/null @@ -1,30 +0,0 @@ -# RUN: llc -O0 %s -o - | FileCheck %s - -# This test checks the expansion of the 16-bit LDWRdPtr pseudo instruction. - ---- | - target triple = "avr--" - define void @test_ldwrdptr() { - entry: - ret void - } -... - ---- -name: test_ldwrdptr -tracksRegLiveness: true -body: | - bb.0.entry: - liveins: $r31r30 - - ; CHECK-LABEL: test_ldwrdptr - - ; CHECK: ld [[SCRATCH:r[0-9]+]], Z - ; CHECK-NEXT: push [[SCRATCH]] - ; CHECK-NEXT: ldd [[SCRATCH]], Z+1 - ; CHECK-NEXT: mov r31, [[SCRATCH]] - ; CHECK-NEXT: pop r30 - - early-clobber $r31r30 = LDWRdPtr undef $r31r30 -... 
- diff --git a/llvm/test/CodeGen/AVR/pseudo/NEGWRd.mir b/llvm/test/CodeGen/AVR/pseudo/NEGWRd.mir --- a/llvm/test/CodeGen/AVR/pseudo/NEGWRd.mir +++ b/llvm/test/CodeGen/AVR/pseudo/NEGWRd.mir @@ -22,5 +22,5 @@ ; CHECK-NEXT: $r14 = NEGRd $r14 ; CHECK-NEXT: $r15 = SBCRdRr $r15, $r1, implicit-def $sreg, implicit killed $sreg - $r15r14 = NEGWRd $r15r14, implicit-def $sreg + $r15r14 = NEGWRd $r15r14, implicit-def $sreg, implicit $r1 ... diff --git a/llvm/test/CodeGen/AVR/pseudo/ROLBrd.mir b/llvm/test/CodeGen/AVR/pseudo/ROLBrd.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AVR/pseudo/ROLBrd.mir @@ -0,0 +1,25 @@ +# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s + +# This test checks the expansion of the 8-bit ROLB (rotate) pseudo instruction. + +--- | + target triple = "avr--" + define void @test_rolbrd() { + entry: + ret void + } +... + +--- +name: test_rolbrd +body: | + bb.0.entry: + liveins: $r14 + + ; CHECK-LABEL: test_rolbrd + + ; CHECK: $r14 = ADDRdRr killed $r14, killed $r14, implicit-def $sreg + ; CHECK-NEXT: $r14 = ADCRdRr $r14, $r1, implicit-def dead $sreg, implicit killed $sreg + + $r14 = ROLBRd $r14, implicit-def $sreg, implicit $r1 +... diff --git a/llvm/test/CodeGen/AVR/unaligned-atomic-loads.ll b/llvm/test/CodeGen/AVR/unaligned-atomic-ops.ll rename from llvm/test/CodeGen/AVR/unaligned-atomic-loads.ll rename to llvm/test/CodeGen/AVR/unaligned-atomic-ops.ll --- a/llvm/test/CodeGen/AVR/unaligned-atomic-loads.ll +++ b/llvm/test/CodeGen/AVR/unaligned-atomic-ops.ll @@ -1,6 +1,6 @@ ; RUN: llc -mattr=addsubiw < %s -march=avr | FileCheck %s -; This verifies that the middle end can handle an unaligned atomic load. +; This verifies that the backend can handle an unaligned atomic load and store. ; ; In the past, an assertion inside the SelectionDAGBuilder would always ; hit an assertion for unaligned loads and stores. @@ -14,6 +14,7 @@ start: %a = getelementptr inbounds %AtomicI16, %AtomicI16* %self, i16 0, i32 0, i32 0 load atomic i16, i16* %a seq_cst, align 1 + store atomic i16 5, i16* %a seq_cst, align 1 ret void } diff --git a/llvm/test/CodeGen/AVR/zeroreg.ll b/llvm/test/CodeGen/AVR/zeroreg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AVR/zeroreg.ll @@ -0,0 +1,27 @@ +; RUN: llc -mattr=avr6,sram < %s -march=avr | FileCheck %s + +; This file tests whether the compiler correctly works with the r1 register, +; clearing it when needed. + +; Test regular use of r1 as a zero register. +; CHECK-LABEL: store8zero: +; CHECK: st {{[XYZ]}}, r1 +; CHECK-NEXT: mov r24, r1 +; CHECK-NEXT: ret +define i8 @store8zero(i8* %x) { + store i8 0, i8* %x + ret i8 0 +} + +; Test that multiplication instructions (mul, muls, etc.) clobber r1 and require +; a "clr r1" instruction. 
+; CHECK-LABEL: mul: +; CHECK: muls +; CHECK-NEXT: clr r1 +; CHECK-NEXT: st {{[XYZ]}}, r0 +; CHECK-NEXT: ret +define void @mul(i8* %ptr, i8 %n) { + %result = mul i8 %n, 3 + store i8 %result, i8* %ptr + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/scalar_to_vector_shuffle.ll deleted file mode 100644 --- a/llvm/test/CodeGen/PowerPC/scalar_to_vector_shuffle.ll +++ /dev/null @@ -1,138 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-LE-P8 -; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-LE-P9 -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-BE-P8 -; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-BE-P9 - -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P8 -; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P9 -; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P8 -; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ -; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ -; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P9 - -define <16 x i8> @test_4_8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) local_unnamed_addr { -; CHECK-LE-P8-LABEL: test_4_8: -; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 -; CHECK-LE-P8-NEXT: addi r3, r5, .LCPI0_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs2, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, f0 -; CHECK-LE-P8-NEXT: xxswapd v3, f1 -; CHECK-LE-P8-NEXT: xxswapd v4, vs2 -; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 -; CHECK-LE-P8-NEXT: blr -; -; CHECK-LE-P9-LABEL: test_4_8: -; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha -; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l -; CHECK-LE-P9-NEXT: xxswapd v2, f0 -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: lxv v4, 0(r3) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 -; CHECK-LE-P9-NEXT: blr -; -; CHECK-BE-P8-LABEL: test_4_8: -; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha -; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-BE-P8-NEXT: addi r3, r5, .LCPI0_0@toc@l -; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 -; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 -; CHECK-BE-P8-NEXT: blr -; -; CHECK-BE-P9-LABEL: test_4_8: -; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 -; 
CHECK-BE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha -; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) -; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l -; CHECK-BE-P9-NEXT: lxv v4, 0(r3) -; CHECK-BE-P9-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 -; CHECK-BE-P9-NEXT: blr -; -; CHECK-AIX-64-P8-LABEL: test_4_8: -; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C0(r2) # %const.0 -; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 -; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-64-P8-NEXT: blr -; -; CHECK-AIX-64-P9-LABEL: test_4_8: -; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C0(r2) # %const.0 -; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) -; CHECK-AIX-64-P9-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) -; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-64-P9-NEXT: blr -; -; CHECK-AIX-32-P8-LABEL: test_4_8: -; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r4) -; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C0(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw v3, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-32-P8-NEXT: blr -; -; CHECK-AIX-32-P9-LABEL: test_4_8: -; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C0(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxmrghw v3, vs1, vs0 -; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-32-P9-NEXT: blr -entry: - %0 = load <4 x i8>, ptr %a, align 4 - %bc1 = bitcast <4 x i8> %0 to i32 - %vecinit3 = insertelement <4 x i32> poison, i32 %bc1, i64 0 - %1 = load <8 x i8>, ptr %b, align 8 - %bc2 = bitcast <8 x i8> %1 to i64 - %vecinit6 = insertelement <2 x i64> undef, i64 %bc2, i64 0 - %2 = bitcast <4 x i32> %vecinit3 to <16 x i8> - %3 = bitcast <2 x i64> %vecinit6 to <16 x i8> - %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> - ret <16 x i8> %shuffle -} diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll @@ -0,0 +1,2090 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LE-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LE-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: 
-ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE-P9 + +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P9 + +define <16 x i8> @test_v16i8_v16i8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v16i8_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lbz r3, 0(r3) +; CHECK-LE-P8-NEXT: lbz r4, 0(r4) +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: mtvsrd v3, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; CHECK-BE-P8-NEXT: lbz r4, 0(r4) +; CHECK-BE-P8-NEXT: lbz r3, 0(r3) +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI0_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P8-NEXT: vperm v2, v4, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; CHECK-BE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-BE-P9-NEXT: lxsibzx v4, 0, r3 +; CHECK-BE-P9-NEXT: addi r5, r5, .LCPI0_0@toc@l +; CHECK-BE-P9-NEXT: lxv v2, 0(r5) +; CHECK-BE-P9-NEXT: vperm v2, v4, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C0(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lbz r4, 0(r4) +; CHECK-AIX-64-P8-NEXT: lbz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: ld r5, L..C0(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxsibzx v4, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r5) +; CHECK-AIX-64-P9-NEXT: vperm v2, v4, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C0(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lbz r4, 0(r4) +; CHECK-AIX-32-P8-NEXT: lbz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r3 +; 
CHECK-AIX-32-P8-NEXT: vperm v2, v4, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lwz r5, L..C0(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxsibzx v4, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r5) +; CHECK-AIX-32-P9-NEXT: vperm v2, v4, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <1 x i8>, ptr %a, align 4 + %bc1 = bitcast <1 x i8> %0 to i8 + %vecinit3 = insertelement <16 x i8> poison, i8 %bc1, i64 0 + %1 = load <1 x i8>, ptr %b, align 8 + %bc2 = bitcast <1 x i8> %1 to i8 + %vecinit6 = insertelement <16 x i8> undef, i8 %bc2, i64 0 + %2 = bitcast <16 x i8> %vecinit3 to <16 x i8> + %3 = bitcast <16 x i8> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v16i8_none(<16 x i8> %a, i8 %b) { +; CHECK-LE-P8-LABEL: test_v16i8_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-LE-P8-NEXT: mtvsrd v4, r5 +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtvsrwz v3, r5 +; CHECK-LE-P9-NEXT: vinsertb v2, v3, 15 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v4, r5 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrwz v3, r5 +; CHECK-BE-P9-NEXT: vinsertb v2, v3, 0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r4, L..C1(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P9-NEXT: vinsertb v2, v3, 0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r4, L..C1(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-32-P9-NEXT: vinsertb v2, v3, 0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %vecins = insertelement <16 x i8> %a, i8 %b, i32 0 + ret <16 x i8> %vecins +} + +define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_none_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI2_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: mtvsrd v4, r3 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI2_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r5 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; 
CHECK-LE-P9-LABEL: test_none_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtvsrd v3, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI2_0@toc@ha +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI2_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C2(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C1(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs = load <16 x i8>, ptr %b, align 4 + %rhs = insertelement <16 x i8> undef, i8 %arg, i32 0 + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v16i8_v8i16(i16 %arg, i8 %arg1) { +; CHECK-LE-P8-LABEL: test_v16i8_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r4, r4, 56 +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: mtvsrd v2, r4 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r4, r4, 56 +; 
CHECK-BE-P9-NEXT: sldi r3, r3, 48 +; CHECK-BE-P9-NEXT: mtvsrd v2, r4 +; CHECK-BE-P9-NEXT: mtvsrd v3, r3 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -32 +; CHECK-AIX-32-P8-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v8i16_v16i8(i16 %arg, i8 %arg1) { +; CHECK-LE-P8-LABEL: test_v8i16_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r4, r4, 56 +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: mtvsrd v2, r4 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r4, r4, 56 +; CHECK-BE-P9-NEXT: sldi r3, r3, 48 +; CHECK-BE-P9-NEXT: mtvsrd v2, r4 +; CHECK-BE-P9-NEXT: mtvsrd v3, r3 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd 
v3, r3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -32 +; CHECK-AIX-32-P8-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %rhs = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_none_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI5_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: mtvsrd v4, r3 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI5_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r5 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtvsrd v3, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI5_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI5_0@toc@ha +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI5_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI5_0@toc@ha +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI5_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C3(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, 
r5 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs = load <16 x i8>, ptr %b, align 4 + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <8 x i16> @test_v8i16_none(<8 x i16> %a, i16 %b) { +; CHECK-LE-P8-LABEL: test_v8i16_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; CHECK-LE-P8-NEXT: mtvsrd v4, r5 +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI6_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtvsrwz v3, r5 +; CHECK-LE-P9-NEXT: vinserth v2, v3, 14 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v4, r5 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI6_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrwz v3, r5 +; CHECK-BE-P9-NEXT: vinserth v2, v3, 0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r4, L..C4(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P9-NEXT: vinserth v2, v3, 0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r4, L..C2(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-32-P9-NEXT: vinserth v2, v3, 0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %vecins = insertelement <8 x i16> %a, i16 %b, i32 0 + ret <8 x i16> %vecins +} + +define <16 x i8> @test_v16i8_v4i32(i8 %arg, i32 %arg1, <16 x i8> %a, <4 x i32> %b) { +; CHECK-LE-P8-LABEL: test_v16i8_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtvsrws v3, r4 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; 
CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: sldi r4, r4, 32 +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: mtvsrd v3, r4 +; CHECK-BE-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtvsrws v3, r4 +; CHECK-BE-P9-NEXT: mtvsrd v2, r3 +; CHECK-BE-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrws v3, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> %a, i8 %arg, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <4 x i32> %b, i32 %arg1, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v4i32_v16i8(i32 %arg, i8 %arg1) { +; CHECK-LE-P8-LABEL: test_v4i32_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: mtvsrws v3, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r4, r4, 56 +; CHECK-BE-P8-NEXT: sldi r3, r3, 32 +; CHECK-BE-P8-NEXT: mtvsrd v2, r4 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r4, r4, 56 +; CHECK-BE-P9-NEXT: mtvsrws v3, r3 +; CHECK-BE-P9-NEXT: mtvsrd v2, r4 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 32 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; 
CHECK-AIX-64-P9-LABEL: test_v4i32_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrws v3, r3 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -32 +; CHECK-AIX-32-P8-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %rhs = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <4 x i32> @test_none_v4i32(<4 x i32> %a, i64 %b) { +; CHECK-LE-P8-LABEL: test_none_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI9_0@toc@ha +; CHECK-LE-P8-NEXT: mtvsrwz v4, r5 +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI9_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI9_1@toc@ha +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI9_1@toc@l +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprwz f0, r5 +; CHECK-LE-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-LE-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI9_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v4, r5 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI9_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI9_1@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI9_1@toc@l +; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r5 +; CHECK-BE-P9-NEXT: xxinsertw v2, vs0, 4 +; CHECK-BE-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r4, L..C5(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C6(r2) # %const.1 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxinsertw v2, vs0, 4 +; CHECK-AIX-64-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: 
test_none_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C4(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %conv = trunc i64 %b to i32 + %vecins = insertelement <4 x i32> %a, i32 %conv, i32 1 + %vecins2 = insertelement <4 x i32> %vecins, i32 %conv, i32 3 + ret <4 x i32> %vecins2 +} + +define <16 x i8> @test_v4i32_none(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v4i32_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI10_0@toc@ha +; CHECK-LE-P8-NEXT: lbzx r4, 0, r4 +; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI10_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI10_0@toc@ha +; CHECK-LE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI10_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vspltb v3, v3, 7 +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lbzx r4, 0, r4 +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI10_0@toc@ha +; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: addi r4, r5, .LCPI10_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI10_0@toc@ha +; CHECK-BE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI10_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vspltb v3, v3, 7 +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lbzx r4, 0, r4 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C7(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vspltb v3, v3, 7 +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: 
test_v4i32_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lbzx r4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C5(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C1(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vspltb v3, v3, 7 +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <4 x i8>, ptr %a, align 4 + %bc1 = bitcast <4 x i8> %0 to i32 + %vecinit3 = insertelement <4 x i32> poison, i32 %bc1, i64 0 + %1 = load <1 x i8>, ptr %b, align 8 + %bc2 = bitcast <1 x i8> %1 to i8 + %vecinit6 = insertelement <16 x i8> undef, i8 %bc2, i64 0 + %2 = bitcast <4 x i32> %vecinit3 to <16 x i8> + %3 = bitcast <16 x i8> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v16i8_v2i64(i8 %arg, i64 %arg1, <16 x i8> %a, <2 x i64> %b) { +; CHECK-LE-P8-LABEL: test_v16i8_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: mtvsrd v3, r4 +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtvsrd v3, r4 +; CHECK-BE-P9-NEXT: mtvsrd v2, r3 +; CHECK-BE-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 
-16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> %a, i8 %arg, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <2 x i64> %b, i64 %arg1, i32 0 + %rhs = bitcast <2 x i64> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_v16i8(i64 %arg, i8 %arg1) { +; CHECK-LE-P8-LABEL: test_v2i64_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r4, r4, 56 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: mtvsrd v2, r4 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r4, r4, 56 +; CHECK-BE-P9-NEXT: mtvsrd v3, r3 +; CHECK-BE-P9-NEXT: mtvsrd v2, r4 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r4, r4, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r4 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: stb r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r5, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %rhs = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs.tmp = insertelement <2 x i64> undef, i64 %arg, i32 0 + %lhs = bitcast <2 x i64> %lhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define dso_local <16 x i8> @test_1_2(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_1_2: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI13_0@toc@ha +; CHECK-LE-P8-NEXT: lbzx r3, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI13_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: mtvsrwz 
v2, r3 +; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_1_2: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI13_0@toc@ha +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI13_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_1_2: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lbzx r3, 0, r3 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_1_2: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_1_2: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lbzx r3, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_1_2: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_1_2: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lbzx r3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C6(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_1_2: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <1 x i8>, ptr %a, align 4 + %bc1 = bitcast <1 x i8> %0 to i8 + %vecinit3 = insertelement <16 x i8> poison, i8 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <16 x i8> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_none_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_none_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI14_0@toc@ha +; CHECK-LE-P8-NEXT: lbzx r3, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI14_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; 
CHECK-LE-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI14_0@toc@ha +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI14_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lbzx r3, 0, r3 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lbzx r3, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lbzx r3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C7(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsibzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C3(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <1 x i8>, ptr %a, align 4 + %bc1 = bitcast <1 x i8> %0 to i8 + %vecinit3 = insertelement <16 x i8> poison, i8 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <16 x i8> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_none(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v2i64_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI15_0@toc@ha +; CHECK-LE-P8-NEXT: lbzx r4, 0, r4 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r3 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI15_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI15_0@toc@ha +; CHECK-LE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI15_0@toc@l +; 
CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vspltb v3, v3, 7 +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lbzx r4, 0, r4 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-BE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-BE-P9-NEXT: vspltb v3, v3, 7 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lbzx r4, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: vspltb v3, v3, 7 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lbzx r4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw v3, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: vspltb v3, v3, 7 +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <8 x i8>, ptr %a, align 4 + %bc1 = bitcast <8 x i8> %0 to i64 + %vecinit3 = insertelement <2 x i64> poison, i64 %bc1, i64 0 + %1 = load <1 x i8>, ptr %b, align 8 + %bc2 = bitcast <1 x i8> %1 to i8 + %vecinit6 = insertelement <16 x i8> undef, i8 %bc2, i64 0 + %2 = bitcast <2 x i64> %vecinit3 to <16 x i8> + %3 = bitcast <16 x i8> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v8i16_v8i16rhs(i16 %arg, i16 %arg1) { +; CHECK-LE-P8-LABEL: test_v8i16_v8i16rhs: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: mtvsrd v3, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v8i16rhs: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtvsrd v2, r3 +; CHECK-LE-P9-NEXT: mtvsrd v3, r4 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v8i16rhs: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI16_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v3, r4 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI16_0@toc@l +; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: vperm v2, v4, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v8i16rhs: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: addis r5, r2, .LCPI16_0@toc@ha +; CHECK-BE-P9-NEXT: mtvsrwz v3, r4 +; CHECK-BE-P9-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P9-NEXT: addi r5, r5, .LCPI16_0@toc@l +; CHECK-BE-P9-NEXT: lxv v2, 0(r5) +; CHECK-BE-P9-NEXT: vperm 
v2, v4, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16rhs: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C8(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16rhs: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: ld r5, L..C4(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r5) +; CHECK-AIX-64-P9-NEXT: vperm v2, v4, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v8i16rhs: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v8i16rhs: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <16 x i8> + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v8i16_v4i32(<8 x i16> %a, <4 x i32> %b, i16 %arg, i32 %arg1) { +; CHECK-LE-P8-LABEL: test_v8i16_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r7 +; CHECK-LE-P8-NEXT: mtfprd f1, r8 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r7 +; CHECK-LE-P9-NEXT: mtvsrws v3, r8 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r7, 48 +; CHECK-BE-P8-NEXT: sldi r4, r8, 32 +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: mtvsrd v3, r4 +; CHECK-BE-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r7, 48 +; CHECK-BE-P9-NEXT: mtvsrws v3, r8 +; CHECK-BE-P9-NEXT: mtvsrd v2, r3 +; CHECK-BE-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrws v3, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: 
test_v8i16_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> %a, i16 %arg, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <4 x i32> %b, i32 %arg1, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v8i16_v2i64(<8 x i16> %a, <2 x i64> %b, i16 %arg, i64 %arg1) { +; CHECK-LE-P8-LABEL: test_v8i16_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r7 +; CHECK-LE-P8-NEXT: mtfprd f1, r8 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r7 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r8 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r7, 48 +; CHECK-BE-P8-NEXT: mtvsrd v3, r8 +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r7, 48 +; CHECK-BE-P9-NEXT: mtvsrd v3, r8 +; CHECK-BE-P9-NEXT: mtvsrd v2, r3 +; CHECK-BE-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> %a, i16 %arg, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to 
<16 x i8> + %rhs.tmp = insertelement <2 x i64> %b, i64 %arg1, i32 0 + %rhs = bitcast <2 x i64> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v4i32_v4i32(i32 %arg, i32 %arg1, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LE-P8-LABEL: test_v4i32_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprwz f0, r3 +; CHECK-LE-P8-NEXT: mtfprwz f1, r4 +; CHECK-LE-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprwz f0, r3 +; CHECK-LE-P9-NEXT: mtfprwz f1, r4 +; CHECK-LE-P9-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: vmrgow v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P9-NEXT: vmrgow v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrgow v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P9-NEXT: vmrgow v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> %a, i32 %arg, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <4 x i32> %b, i32 %arg1, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v4i32_v8i16(i32 %arg, i16 %arg1) { +; CHECK-LE-P8-LABEL: test_v4i32_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: mtvsrws v2, r3 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 32 +; CHECK-BE-P8-NEXT: sldi r4, r4, 48 +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: mtvsrd v3, r4 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; 
CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrws v2, r3 +; CHECK-BE-P9-NEXT: sldi r3, r4, 48 +; CHECK-BE-P9-NEXT: mtvsrd v3, r3 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 32 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r4 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrws v2, r3 +; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v2i64_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C8(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; 
CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C4(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <8 x i8>, ptr %a, align 4 + %bc1 = bitcast <8 x i8> %0 to i64 + %vecinit3 = insertelement <2 x i64> poison, i64 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <2 x i64> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_v4i32(i64 %arg, i32 %arg1, <2 x i64> %a, <4 x i32> %b) { +; CHECK-LE-P8-LABEL: test_v2i64_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrglw v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxmrglw v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: sldi r3, r4, 32 +; CHECK-BE-P8-NEXT: mtfprd f1, r3 +; CHECK-BE-P8-NEXT: xxmrghw v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghw v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghw v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrws vs1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghw v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <2 x i64> %a, i64 %arg, i32 0 + %lhs = bitcast <2 x i64> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <4 x i32> %b, i32 %arg1, i32 0 + %rhs = bitcast <4 
x i32> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_v8i16(i64 %arg, i16 %arg1) { +; CHECK-LE-P8-LABEL: test_v2i64_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: sldi r3, r4, 48 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrd v2, r3 +; CHECK-BE-P9-NEXT: sldi r3, r4, 48 +; CHECK-BE-P9-NEXT: mtvsrd v3, r3 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: sth r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r5, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <2 x i64> undef, i64 %arg, i32 0 + %lhs = bitcast <2 x i64> %lhs.tmp to <16 x i8> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <16 x i8> + %shuffle = shufflevector <16 x i8> %lhs, <16 x i8> %rhs, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v4i32_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v4i32_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI24_0@toc@ha +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 +; CHECK-LE-P8-NEXT: addi r3, r5, .LCPI24_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs2, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, f0 +; CHECK-LE-P8-NEXT: xxswapd v3, f1 +; CHECK-LE-P8-NEXT: xxswapd v4, vs2 +; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 +; 
CHECK-LE-P9-NEXT: addis r3, r2, .LCPI24_0@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI24_0@toc@l +; CHECK-LE-P9-NEXT: xxswapd v2, f0 +; CHECK-LE-P9-NEXT: lfd f0, 0(r4) +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI24_0@toc@ha +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: addi r3, r5, .LCPI24_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI24_0@toc@ha +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI24_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C9(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r4) +; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C9(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghw v3, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C5(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxmrghw v3, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <4 x i8>, ptr %a, align 4 + %bc1 = bitcast <4 x i8> %0 to i32 + %vecinit3 = insertelement <4 x i32> poison, i32 %bc1, i64 0 + %1 = load <8 x i8>, ptr %b, align 8 + %bc2 = bitcast <8 x i8> %1 to i64 + %vecinit6 = insertelement <2 x i64> undef, i64 %bc2, i64 0 + %2 = bitcast <4 x i32> %vecinit3 to <16 x i8> + %3 = bitcast <2 x i64> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 
x i8> %3, <16 x i32>
+ ret <16 x i8> %shuffle
+}
diff --git a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll
@@ -0,0 +1,1909 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-LE-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-LE-P9
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE-P9
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P9
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P9
+
+define <2 x i64> @test_v16i8_v16i8(i8 %arg1, i8 %arg) {
+; CHECK-LE-P8-LABEL: test_v16i8_v16i8:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: mtfprd f0, r3
+; CHECK-LE-P8-NEXT: mtfprd f1, r4
+; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v16i8_v16i8:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: mtfprd f0, r3
+; CHECK-LE-P9-NEXT: mtfprd f1, r4
+; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v16i8_v16i8:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: mtfprwz f0, r3
+; CHECK-BE-P8-NEXT: mtfprwz f1, r4
+; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v16i8_v16i8:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: mtfprwz f0, r3
+; CHECK-BE-P9-NEXT: mtfprwz f1, r4
+; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v16i8_v16i8:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3
+; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4
+; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v16i8_v16i8:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3
+; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4
+; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v16i8_v16i8:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16
+; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5
+; CHECK-AIX-32-P8-NEXT: stb r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v16i8_v16i8:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stb r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0
+ %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64>
+ %rhs.tmp = insertelement <16 x i8> undef, i8 %arg, i32 0
+ %rhs = bitcast <16 x i8> %rhs.tmp to <2 x i64>
+ %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) {
+; CHECK-LE-P8-LABEL: test_none_v16i8:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: xxswapd v2, vs0
+; CHECK-LE-P8-NEXT: mtfprd f0, r3
+; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_none_v16i8:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxv v2, 0(r4)
+; CHECK-LE-P9-NEXT: mtfprd f0, r3
+; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_none_v16i8:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4
+; CHECK-BE-P8-NEXT: mtfprwz f0, r3
+; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_none_v16i8:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxv v2, 0(r4)
+; CHECK-BE-P9-NEXT: mtfprwz f0, r3
+; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_none_v16i8:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_none_v16i8:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4)
+; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3
+; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_none_v16i8:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16
+; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5
+; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4
+; CHECK-AIX-32-P8-NEXT: xxmrghd v2, v2, vs0
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_none_v16i8:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4)
+; CHECK-AIX-32-P9-NEXT: xxmrghd v2, v2, vs0
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0
+ %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64>
+ %rhs = load <2 x i64>, ptr %b, align 4
+ %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32>
+ ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) {
+; CHECK-LE-P8-LABEL: test_v16i8_none:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: xxswapd v2, vs0
+; CHECK-LE-P8-NEXT: mtfprd f0, r3
+; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v16i8_none:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxv
v2, 0(r4) +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs = load <2 x i64>, ptr %b, align 4 + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v16i8_v8i16(i8 %arg1, i16 %arg) { +; CHECK-LE-P8-LABEL: test_v16i8_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: sldi r4, r4, 48 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: sldi r3, r4, 48 +; CHECK-BE-P9-NEXT: mtfprd f1, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; 
CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v8i16_v16i8(i8 %arg1, i16 %arg) { +; CHECK-LE-P8-LABEL: test_v8i16_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: sldi r4, r4, 48 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: sldi r3, r4, 48 +; CHECK-BE-P9-NEXT: mtfprd f1, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: 
test_v8i16_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v8i16_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <2 x i64> + %rhs = load <2 x i64>, ptr %b, align 4 + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_none_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v8i16: +; CHECK-BE-P8: # %bb.0: # 
%entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <2 x i64> + %rhs = load <2 x i64>, ptr %b, align 4 + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v16i8_v4i32(i8 %arg1, i32 %arg) { +; CHECK-LE-P8-LABEL: test_v16i8_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: sldi r4, r4, 32 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrws vs1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: 
addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v4i32_v16i8(i8 %arg1, i32 %arg) { +; CHECK-LE-P8-LABEL: test_v4i32_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: sldi r4, r4, 32 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrws vs1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + 
%rhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_none_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprwz f0, r3 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: mtfprwz f0, r3 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <2 x i64> + %rhs = load <2 x i64>, ptr %b, align 4 + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v4i32_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprwz f0, r3 +; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: mtfprwz f0, r3 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_none: +; 
CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <2 x i64> + %rhs = load <2 x i64>, ptr %b, align 4 + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v16i8_v2i64(i8 %arg1, i64 %arg) { +; CHECK-LE-P8-LABEL: test_v16i8_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v16i8_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxswapd v2, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v16i8_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 56 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v16i8_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 56 +; CHECK-BE-P9-NEXT: mtfprd f1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v16i8_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v16i8_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v16i8_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C0(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: stw r5, -48(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C1(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 +; 
CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v16i8_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r5 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <2 x i64> undef, i64 %arg, i32 0 + %rhs = bitcast <2 x i64> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v2i64_v16i8(i8 %arg1, i64 %arg) { +; CHECK-LE-P8-LABEL: test_v2i64_v16i8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v16i8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxswapd v2, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v16i8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprd f0, r4 +; CHECK-BE-P8-NEXT: xxspltd v2, vs0, 0 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v16i8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtvsrdd v2, r4, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v16i8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs0 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v16i8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: mtvsrdd v2, r4, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v16i8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -16 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: stw r4, -48(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v16i8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -48(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, -48(r1) +; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 +; 
CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 + %lhs = bitcast <16 x i8> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <2 x i64> undef, i64 %arg, i32 0 + %rhs = bitcast <2 x i64> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_none_v2i64(ptr nocapture noundef readonly %b, i64 %arg) { +; CHECK-LE-P8-LABEL: test_none_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r3) +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-BE-P8-NEXT: mtfprd f0, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r3) +; CHECK-BE-P9-NEXT: mtfprd f0, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r6, L..C2(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r6 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v4, v2 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r5 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs = load <2 x i64>, ptr %b, align 4 + %rhs = insertelement <2 x i64> undef, i64 %arg, i32 0 + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v2i64_none(ptr nocapture noundef readonly %b, i64 %arg) { +; CHECK-LE-P8-LABEL: test_v2i64_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: xxpermdi v2, v2, vs0, 2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxv v2, 0(r3) +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxpermdi v2, v2, vs0, 2 +; 
CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprd f0, r4 +; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-BE-P8-NEXT: xxspltd v3, vs0, 0 +; CHECK-BE-P8-NEXT: xxmrghd v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxv v2, 0(r3) +; CHECK-BE-P9-NEXT: mtvsrdd v3, r4, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r4 +; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v3, vs0, vs0 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: mtvsrdd v3, r4, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs = load <2 x i64>, ptr %b, align 4 + %rhs = insertelement <2 x i64> undef, i64 %arg, i32 0 + %shuffle = shufflevector <2 x i64> %rhs, <2 x i64> %lhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v8i16_v8i16(i16 %arg1, i16 %arg) { +; CHECK-LE-P8-LABEL: test_v8i16_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtfprwz f1, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: 
test_v8i16_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v8i16_v4i32(i16 %arg1, i32 %arg) { +; CHECK-LE-P8-LABEL: test_v8i16_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: sldi r4, r4, 32 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 48 +; CHECK-BE-P9-NEXT: mtvsrws vs1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 32 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtvsrws vs1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x 
i16> undef, i16 %arg1, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v8i16_v2i64(i16 %arg1, i64 %arg) { +; CHECK-LE-P8-LABEL: test_v8i16_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxswapd v2, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: sldi r3, r3, 48 +; CHECK-BE-P9-NEXT: mtfprd f1, r4 +; CHECK-BE-P9-NEXT: mtfprd f0, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C4(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: stw r5, -48(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C5(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 8 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r5 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 + %lhs = bitcast <8 x i16> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <2 x i64> undef, i64 %arg, i32 0 + %rhs = bitcast <2 x i64> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v4i32_v4i32(i32 %arg1, i32 %arg) { +; CHECK-LE-P8-LABEL: test_v4i32_v4i32: +; 
CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprwz f0, r3 +; CHECK-LE-P8-NEXT: mtfprwz f1, r4 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprwz f0, r3 +; CHECK-LE-P9-NEXT: mtfprwz f1, r4 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtfprwz f1, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v4i32_v8i16(i32 %arg1, i16 %arg) { +; CHECK-LE-P8-LABEL: test_v4i32_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: mtvsrws vs0, r3 +; CHECK-LE-P9-NEXT: xxswapd vs1, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 32 +; CHECK-BE-P8-NEXT: sldi r4, r4, 48 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrws vs0, r3 +; CHECK-BE-P9-NEXT: sldi r3, r4, 48 +; CHECK-BE-P9-NEXT: mtfprd f1, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: +; 
CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 32 +; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrws vs0, r3 +; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v4i32_v2i64(i32 %arg1, i64 %arg) { +; CHECK-LE-P8-LABEL: test_v4i32_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: mtvsrws vs0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs1 +; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: sldi r3, r3, 32 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtvsrws vs0, r3 +; CHECK-BE-P9-NEXT: mtfprd f1, r4 +; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 32 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtvsrws vs0, r3 +; CHECK-AIX-64-P9-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: stw r3, -48(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; 
CHECK-AIX-32-P8-NEXT: lwz r3, L..C6(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r3, -48(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C0(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -48(r1) +; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v4, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <4 x i32> undef, i32 %arg1, i32 0 + %lhs = bitcast <4 x i32> %lhs.tmp to <2 x i64> + %rhs = insertelement <2 x i64> undef, i64 %arg, i32 0 + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v2i64_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { +; CHECK-LE-P8-LABEL: test_v2i64_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: ld r3, 0(r3) +; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: xxmrghd v3, vs0, vs1 +; CHECK-LE-P8-NEXT: vaddudm v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: ld r3, 0(r3) +; CHECK-LE-P9-NEXT: lfd f1, 0(r4) +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: xxmrghd v3, vs1, vs0 +; CHECK-LE-P9-NEXT: vaddudm v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-BE-P8-NEXT: lfdx f0, 0, r4 +; CHECK-BE-P8-NEXT: xxmrghd v3, v2, vs0 +; CHECK-BE-P8-NEXT: vaddudm v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-BE-P9-NEXT: lfd f0, 0(r4) +; CHECK-BE-P9-NEXT: xxmrghd v3, v2, vs0 +; CHECK-BE-P9-NEXT: vaddudm v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v3, v2, vs0 +; CHECK-AIX-64-P8-NEXT: vaddudm v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r4) +; CHECK-AIX-64-P9-NEXT: xxmrghd v3, v2, vs0 +; CHECK-AIX-64-P9-NEXT: vaddudm v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r3) +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -32 +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 4(r4) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r3, -48(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: 
addi r4, r1, -64 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: stw r3, -64(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -48 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs3, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs3, vs2 +; CHECK-AIX-32-P8-NEXT: xxmrghd v3, v2, vs0 +; CHECK-AIX-32-P8-NEXT: vaddudm v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lwz r5, 4(r3) +; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -48(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -64(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -64(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: xxmrghd v3, v2, vs0 +; CHECK-AIX-32-P9-NEXT: vaddudm v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <8 x i8>, ptr %a, align 8 + %bc1 = bitcast <8 x i8> %0 to i64 + %vecinit3 = insertelement <2 x i64> poison, i64 %bc1, i64 0 + %1 = load <8 x i8>, ptr %b, align 8 + %bc2 = bitcast <8 x i8> %1 to i64 + %vecinit6 = insertelement <2 x i64> undef, i64 %bc2, i64 0 + %2 = bitcast <2 x i64> %vecinit3 to <2 x i64> + %3 = bitcast <2 x i64> %vecinit6 to <2 x i64> + %shuffle = shufflevector <2 x i64> %2, <2 x i64> %3, <2 x i32> + %4 = add <2 x i64> %shuffle, %2 + ret <2 x i64> %4 +} + +define <2 x i64> @test_v2i64_v4i32(i64 %arg1, i32 %arg) { +; CHECK-LE-P8-LABEL: test_v2i64_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtvsrws vs0, r4 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: xxspltd v2, vs0, 0 +; CHECK-BE-P8-NEXT: mtfprwz f0, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r4 +; CHECK-BE-P9-NEXT: mtvsrdd v2, r3, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs0 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrdd v2, r3, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -48 +; CHECK-AIX-32-P8-NEXT: stw r5, -48(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: 
stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r5, -48(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <2 x i64> undef, i64 %arg1, i32 0 + %lhs = bitcast <2 x i64> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <4 x i32> undef, i32 %arg, i32 0 + %rhs = bitcast <4 x i32> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + +define <2 x i64> @test_v2i64_v8i16(i64 %arg1, i16 %arg) { +; CHECK-LE-P8-LABEL: test_v2i64_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs1 +; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprd f0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, vs0 +; CHECK-LE-P9-NEXT: mtfprd f0, r4 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: xxspltd v2, vs0, 0 +; CHECK-BE-P8-NEXT: mtfprwz f0, r4 +; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mtfprwz f0, r4 +; CHECK-BE-P9-NEXT: mtvsrdd v2, r3, r3 +; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs0 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-64-P9-NEXT: mtvsrdd v2, r3, r3 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -48 +; CHECK-AIX-32-P8-NEXT: sth r5, -48(r1) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, 
-32(r1) +; CHECK-AIX-32-P9-NEXT: sth r5, -48(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %lhs.tmp = insertelement <2 x i64> undef, i64 %arg1, i32 0 + %lhs = bitcast <2 x i64> %lhs.tmp to <2 x i64> + %rhs.tmp = insertelement <8 x i16> undef, i16 %arg, i32 0 + %rhs = bitcast <8 x i16> %rhs.tmp to <2 x i64> + %shuffle = shufflevector <2 x i64> %lhs, <2 x i64> %rhs, <2 x i32> + ret <2 x i64> %shuffle +} + diff --git a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll @@ -0,0 +1,1445 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LE-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LE-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE-P9 + +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P9 + +define void @test_none_v8i16(ptr %a) { +; CHECK-LE-P8-LABEL: test_none_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha +; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: lhz r4, 0(r3) +; CHECK-LE-P8-NEXT: mtvsrd v4, r4 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stfdx f0, 0, r3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxsd v3, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P9-NEXT: xxswapd vs0, v2 +; CHECK-LE-P9-NEXT: stfd f0, 0(r3) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lhz r4, 0(r3) +; CHECK-BE-P8-NEXT: lfdx f0, 0, r3 +; CHECK-BE-P8-NEXT: sldi r3, r4, 48 +; 
CHECK-BE-P8-NEXT: mtfprd f1, r3 +; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P8-NEXT: stfdx f0, 0, r3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lfd f0, 0(r3) +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, v2 +; CHECK-BE-P9-NEXT: stfd f0, 0(r3) +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: stfdx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3) +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, v2 +; CHECK-AIX-64-P9-NEXT: stfd f0, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: stw r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: stxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr undef, align 1 + %tmp0_1 = bitcast <2 x i8> %0 to i16 + %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0 + %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32> + %1 = load <2 x i32>, ptr %a + %tmp1_1 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> + %2 = shufflevector <4 x i32> %tmp0_3, <4 x i32> %tmp1_1, <2 x i32> + store <2 x i32> %2, ptr undef, align 4 + ret void +} + +define void @test_v8i16_none(ptr %a) { +; CHECK-LE-P8-LABEL: test_v8i16_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lhz r4, 0(r3) +; CHECK-BE-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_none: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx 
f0, 0, r3 +; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-64-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr undef, align 1 + %tmp0_1 = bitcast <2 x i8> %0 to i16 + %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0 + %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32> + %1 = load <4 x i32>, ptr %a, align 1 + %2 = shufflevector <4 x i32> %tmp0_3, <4 x i32> %1, <4 x i32> + store <4 x i32> %2, ptr undef, align 4 + ret void +} + +define void @test_none_v4i32(<2 x i32> %vec, ptr %ptr1) { +; CHECK-LE-P8-LABEL: test_none_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r3 +; CHECK-LE-P8-NEXT: mffprwz r3, f0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_none_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: li r3, 0 +; CHECK-LE-P9-NEXT: vextuwrx r3, r3, v2 +; CHECK-LE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: stxv v2, 0(r5) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_none_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: xxsldwi vs0, v2, v2, 3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: mffprwz r4, f0 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 +; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_none_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: li r3, 0 +; CHECK-BE-P9-NEXT: vextuwlx r3, r3, v2 +; CHECK-BE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: stxv v2, 0(r5) +; 
CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_none_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: ld r4, L..C0(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mffprwz r5, f0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_none_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: li r4, 0 +; CHECK-AIX-64-P9-NEXT: vextuwlx r4, r4, v2 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P9-NEXT: ld r4, L..C0(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r4) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_none_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r4, L..C0(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_none_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lwz r4, L..C0(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r4) +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = extractelement <2 x i32> %vec, i64 0 + %1 = bitcast i32 %0 to <2 x i16> + %2 = shufflevector <2 x i16> %1, <2 x i16> %1, <8 x i32> + %3 = shufflevector <2 x i32> %vec, <2 x i32> %vec, <4 x i32> + %4 = bitcast <4 x i32> %3 to <8 x i16> + %5 = shufflevector <8 x i16> %4, <8 x i16> %2, <8 x i32> + store <8 x i16> %5, ptr %ptr1, align 16 + ret void +} + +define void @test_v4i32_none(<2 x i32> %vec, ptr %ptr1) { +; CHECK-LE-P8-LABEL: test_v4i32_none: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r3 +; CHECK-LE-P8-NEXT: mffprwz r3, f0 +; CHECK-LE-P8-NEXT: xxswapd v3, vs1 +; CHECK-LE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_none: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: li r3, 0 +; CHECK-LE-P9-NEXT: vextuwrx r3, r3, v2 +; CHECK-LE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P9-NEXT: stxv v2, 0(r5) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_none: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: xxsldwi vs0, v2, v2, 3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: mffprwz r4, f0 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 +; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_none: +; CHECK-BE-P9: # %bb.0: 
# %entry +; CHECK-BE-P9-NEXT: li r3, 0 +; CHECK-BE-P9-NEXT: vextuwlx r3, r3, v2 +; CHECK-BE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P9-NEXT: stxv v2, 0(r5) +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_none: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: ld r4, L..C1(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mffprwz r5, f0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_none: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: li r4, 0 +; CHECK-AIX-64-P9-NEXT: vextuwlx r4, r4, v2 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P9-NEXT: ld r4, L..C1(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r4) +; CHECK-AIX-64-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_none: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r4, L..C1(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_none: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lwz r4, L..C1(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r4) +; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = extractelement <2 x i32> %vec, i64 0 + %1 = bitcast i32 %0 to <2 x i16> + %2 = shufflevector <2 x i16> %1, <2 x i16> %1, <8 x i32> + %3 = shufflevector <2 x i32> %vec, <2 x i32> %vec, <4 x i32> + %4 = bitcast <4 x i32> %3 to <8 x i16> + %5 = shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> + store <8 x i16> %5, ptr %ptr1, align 16 + ret void +} + +define void @test_none_v2i64(ptr %ptr, i32 %v1, <2 x i32> %vec) local_unnamed_addr #0 { +; CHECK-LE-P8-LABEL: test_none_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha +; CHECK-LE-P8-NEXT: mtvsrwz v4, r4 +; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI4_1@toc@ha +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l +; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI4_1@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 +; +; CHECK-LE-P9-LABEL: test_none_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsd v3, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; CHECK-LE-P9-NEXT: mtfprwz f0, r4 +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l +; CHECK-LE-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: stxv v2, 0(r3) 
+;
+; CHECK-BE-P8-LABEL: test_none_v2i64:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha
+; CHECK-BE-P8-NEXT: mtvsrwz v4, r4
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI4_1@toc@ha
+; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI4_1@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r5
+; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P8-NEXT: lxsdx v3, 0, r3
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r4
+; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+;
+; CHECK-BE-P9-LABEL: test_none_v2i64:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsd v3, 0(r3)
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha
+; CHECK-BE-P9-NEXT: mtfprwz f0, r4
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l
+; CHECK-BE-P9-NEXT: xxinsertw v2, vs0, 0
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+;
+; CHECK-AIX-64-P8-LABEL: test_none_v2i64:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r5, L..C2(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r4
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C3(r2) # %const.1
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5
+; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r3
+; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4
+; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+;
+; CHECK-AIX-64-P9-LABEL: test_none_v2i64:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r3)
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4
+; CHECK-AIX-64-P9-NEXT: xxinsertw v2, vs0, 0
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+;
+; CHECK-AIX-32-P8-LABEL: test_none_v2i64:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r5, L..C2(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3
+; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x v5, 0, r3
+; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.1
+; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-AIX-32-P8-NEXT: vperm v2, v5, v2, v4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3
+; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+;
+; CHECK-AIX-32-P9-LABEL: test_none_v2i64:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4
+; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+entry:
+ %0 = load <2 x i32>, ptr %ptr, align 4
+ %tmp = insertelement <2 x i32> %vec, i32 %v1, i32 0
+ %1 = shufflevector <2 x i32> %0, <2 x i32> %tmp, <4 x i32> 
+ store <4 x i32> %1, ptr undef, align 4
+ unreachable
+}
+
+define void @test_v2i64_none() {
+; CHECK-LE-P8-LABEL: test_v2i64_none:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v2i64_none:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v2i64_none:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v2i64_none:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v2i64_none:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v2i64_none:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v2i64_none:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v2i64_none:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i32>, ptr undef, align 4
+ %1 = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> 
+ store <4 x i32> %1, ptr undef, align 4
+ ret void
+}
+
+define void @test_v8i16_v8i16(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v8i16_v8i16:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-LE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-LE-P8-NEXT: mtfprd f0, r4
+; CHECK-LE-P8-NEXT: mtfprd f1, r3
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v8i16_v8i16:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3
+; CHECK-LE-P9-NEXT: lxsihzx f1, 0, r3
+; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v8i16_v8i16:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-BE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-BE-P8-NEXT: mtfprwz f0, r4
+; CHECK-BE-P8-NEXT: mtfprwz f1, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v8i16_v8i16:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx f0, 0, r3
+; CHECK-BE-P9-NEXT: lxsihzx f1, 0, r3
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r4
+; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3
+; CHECK-AIX-64-P9-NEXT: lxsihzx f1, 0, r3
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v8i16_v8i16:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: mtfprwz f0, r4
+; CHECK-AIX-32-P8-NEXT: mtfprwz f1, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v8i16_v8i16:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3
+; CHECK-AIX-32-P9-NEXT: lxsihzx f1, 0, r3
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i8>, ptr undef, align 1
+ %tmp0_1 = bitcast <2 x i8> %0 to i16
+ %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0
+ %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32>
+ %1 = load <2 x i8>, ptr %a, align 1
+ %tmp1_1 = bitcast <2 x i8> %1 to i16
+ %tmp1_2 = insertelement <8 x i16> undef, i16 %tmp1_1, i32 0
+ %tmp1_3 = bitcast <8 x i16> %tmp1_2 to <4 x i32>
+ %2 = shufflevector <4 x i32> %tmp0_3, <4 x i32> %tmp1_3, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v8i16_v4i32(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v8i16_v4i32:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-LE-P8-NEXT: mtfprd f1, r4
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, vs1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v8i16_v4i32:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, v2
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v8i16_v4i32:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-BE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-BE-P8-NEXT: sldi r3, r4, 48
+; CHECK-BE-P8-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-BE-P8-NEXT: mtfprd f1, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v8i16_v4i32:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-BE-P9-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-BE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-BE-P9-NEXT: xxmrghw vs0, v2, vs0
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48
+; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-AIX-64-P9-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, v2, vs0
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4
+; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i8>, ptr undef, align 1
+ %tmp0_1 = bitcast <2 x i8> %0 to i16
+ %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0
+ %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32>
+ %1 = load <2 x i16>, ptr %a, align 4
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp0_3, <4 x i32> %tmp1_2, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v8i16_v2i64(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v8i16_v2i64:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: mtfprd f1, r4
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, vs1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v8i16_v2i64:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, v2
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v8i16_v2i64:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: sldi r3, r4, 48
+; CHECK-BE-P8-NEXT: mtfprd f1, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v8i16_v2i64:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-BE-P9-NEXT: xxmrghw vs0, v2, vs0
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v8i16_v2i64:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48
+; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v8i16_v2i64:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, v2, vs0
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4
+; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i8>, ptr undef, align 1
+ %tmp0_1 = bitcast <2 x i8> %0 to i16
+ %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0
+ %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32>
+ %1 = load <2 x i16>, ptr %a, align 8
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp0_3, <4 x i32> %tmp1_2, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define <16 x i8> @test_v4i32_v4i32(ptr %a, ptr %b) {
+; CHECK-LE-P8-LABEL: test_v4i32_v4i32:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI9_0@toc@ha
+; CHECK-LE-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r4
+; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI9_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5
+; CHECK-LE-P8-NEXT: xxswapd v4, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v4i32_v4i32:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI9_0@toc@ha
+; CHECK-LE-P9-NEXT: lxsiwzx v3, 0, r4
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI9_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v4i32_v4i32:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI9_0@toc@ha
+; CHECK-BE-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r4
+; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI9_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v4i32_v4i32:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI9_0@toc@ha
+; CHECK-BE-P9-NEXT: lxsiwzx v3, 0, r4
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI9_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r5, L..C4(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: lxsiwzx v3, 0, r4
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r5, L..C4(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C3(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %load1 = load <4 x i8>, ptr %a
+ %load2 = load <4 x i8>, ptr %b
+ %shuffle1 = shufflevector <4 x i8> %load1, <4 x i8> %load2, <8 x i32> 
+ %shuffle2 = shufflevector <8 x i8> %shuffle1, <8 x i8> %shuffle1, <16 x i32> 
+ ret <16 x i8> %shuffle2
+}
+
+define void @test_v4i32_v8i16(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v4i32_v8i16:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-LE-P8-NEXT: mtfprd f1, r4
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, vs1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v4i32_v8i16:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-LE-P9-NEXT: xxmrglw vs0, v2, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v4i32_v8i16:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-BE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-BE-P8-NEXT: sldi r3, r4, 48
+; CHECK-BE-P8-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-BE-P8-NEXT: mtfprd f1, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v4i32_v8i16:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-BE-P9-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-BE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, v2
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48
+; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3
+; CHECK-AIX-64-P9-NEXT: xxsldwi vs0, f0, f0, 1
+; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, v2
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4
+; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i8>, ptr undef, align 1
+ %tmp0_1 = bitcast <2 x i8> %0 to i16
+ %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0
+ %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32>
+ %1 = load <2 x i16>, ptr %a, align 4
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp1_2, <4 x i32> %tmp0_3, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v4i32_v2i64(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v4i32_v2i64:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, f1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v4i32_v2i64:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: xxswapd vs1, f1
+; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, vs1
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v4i32_v2i64:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v4i32_v2i64:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v4i32_v2i64:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i16>, ptr undef, align 8
+ %tmp0_1 = bitcast <2 x i16> %0 to i32
+ %tmp0_2 = insertelement <4 x i32> undef, i32 %tmp0_1, i32 0
+ %1 = load <2 x i16>, ptr %a, align 4
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp1_2, <4 x i32> %tmp0_2, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v2i64_v2i64(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v2i64_v2i64:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: lfdx f1, 0, r3
+; CHECK-LE-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v2i64_v2i64:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: lfd f1, 0(r3)
+; CHECK-LE-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v2i64_v2i64:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: lfdx f1, 0, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v2i64_v2i64:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: lfd f1, 0(r3)
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: lfdx f1, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: lfd f1, 0(r3)
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r3)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16
+; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1)
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: lfiwzx f2, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs2, 1
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lwz r4, 4(r3)
+; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i32>, ptr undef, align 4
+ %1 = load <2 x i32>, ptr %a, align 4
+ %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v2i64_v4i32(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v2i64_v4i32:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, f1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v2i64_v4i32:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: xxswapd vs1, f1
+; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v2i64_v4i32:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v2i64_v4i32:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v2i64_v4i32:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lfiwzx f1, 0, r3
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v2i64_v4i32:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lfiwzx f1, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: xxsldwi vs1, f1, f1, 1
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i16>, ptr undef, align 8
+ %tmp0_1 = bitcast <2 x i16> %0 to i32
+ %tmp0_2 = insertelement <4 x i32> undef, i32 %tmp0_1, i32 0
+ %1 = load <2 x i16>, ptr %a, align 4
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp0_2, <4 x i32> %tmp1_2, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
+
+define void @test_v2i64_v8i16(ptr %a) {
+; CHECK-LE-P8-LABEL: test_v2i64_v8i16:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-LE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: mtfprd f1, r4
+; CHECK-LE-P8-NEXT: xxswapd vs0, f0
+; CHECK-LE-P8-NEXT: xxswapd vs1, vs1
+; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0
+; CHECK-LE-P8-NEXT: xxswapd vs0, vs0
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v2i64_v8i16:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-LE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: xxswapd vs0, f0
+; CHECK-LE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-LE-P9-NEXT: xxmrglw vs0, v2, vs0
+; CHECK-LE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v2i64_v8i16:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: lhz r4, 0(r3)
+; CHECK-BE-P8-NEXT: lfdx f0, 0, r3
+; CHECK-BE-P8-NEXT: sldi r3, r4, 48
+; CHECK-BE-P8-NEXT: mtfprd f1, r3
+; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v2i64_v8i16:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-BE-P9-NEXT: lfd f0, 0(r3)
+; CHECK-BE-P9-NEXT: vsplth v2, v2, 3
+; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, v2
+; CHECK-BE-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v2i64_v8i16:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3
+; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48
+; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3
+; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1
+; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v2i64_v8i16:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3
+; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, v2
+; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4
+; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1)
+; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i8>, ptr undef, align 1
+ %tmp0_1 = bitcast <2 x i8> %0 to i16
+ %tmp0_2 = insertelement <8 x i16> undef, i16 %tmp0_1, i32 0
+ %tmp0_3 = bitcast <8 x i16> %tmp0_2 to <4 x i32>
+ %1 = load <2 x i16>, ptr %a, align 8
+ %tmp1_1 = bitcast <2 x i16> %1 to i32
+ %tmp1_2 = insertelement <4 x i32> undef, i32 %tmp1_1, i32 0
+ %2 = shufflevector <4 x i32> %tmp1_2, <4 x i32> %tmp0_3, <4 x i32> 
+ store <4 x i32> %2, ptr undef, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll
@@ -0,0 +1,1554 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-LE-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-LE-P9
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE-P9
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P9
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P8
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P9
+
+define void @test_none_v8i16(ptr %a0, ptr %a1, <16 x i8> %a, <8 x i16> %b, i8 %arg) {
+; CHECK-LE-P8-LABEL: test_none_v8i16:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha
+; CHECK-LE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: mtvsrd v4, r3
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_none_v8i16:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-LE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_none_v8i16:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha
+; CHECK-BE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: mtvsrwz v4, r3
+; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_none_v8i16:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_none_v8i16:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C0(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_none_v8i16:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C0(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_none_v8i16:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, L..C0(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r3
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_none_v8i16:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C0(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %load0.tmp = load <2 x i8>, ptr %a0
+ %load0.tmp1 = bitcast <2 x i8> %load0.tmp to i16
+ %load0 = insertelement <8 x i16> %b, i16 %load0.tmp1, i64 0
+ %load1.tmp = insertelement <16 x i8> %a, i8 %arg, i32 0
+ %load1 = bitcast <16 x i8> %load1.tmp to <8 x i16>
+ %shuff = shufflevector <8 x i16> %load0, <8 x i16> %load1, <8 x i32> 
+ store <8 x i16> %shuff, ptr undef
+ ret void
+}
+
+define void @test_v8i16_none(ptr %a0, ptr %a1, <16 x i8> %a, <8 x i16> %b, i8 %arg) {
+; CHECK-LE-P8-LABEL: test_v8i16_none:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI1_0@toc@ha
+; CHECK-LE-P8-NEXT: mtvsrd v4, r9
+; CHECK-LE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI1_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI1_1@toc@ha
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI1_1@toc@l
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: mtvsrd v4, r3
+; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v8i16_none:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-LE-P9-NEXT: mtvsrwz v4, r9
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l
+; CHECK-LE-P9-NEXT: vinsertb v2, v4, 15
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-LE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v8i16_none:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI1_0@toc@ha
+; CHECK-BE-P8-NEXT: mtvsrwz v4, r9
+; CHECK-BE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI1_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI1_1@toc@ha
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI1_1@toc@l
+; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: mtvsrwz v4, r3
+; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v8i16_none:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-BE-P9-NEXT: mtvsrwz v4, r9
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l
+; CHECK-BE-P9-NEXT: vinsertb v2, v4, 0
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v8i16_none:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C1(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r5
+; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C2(r2) # %const.1
+; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3
+; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v8i16_none:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C1(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: mtvsrwz v4, r5
+; CHECK-AIX-64-P9-NEXT: vinsertb v2, v4, 0
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v8i16_none:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, L..C1(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r5
+; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: lwz r4, L..C2(r2) # %const.1
+; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r3
+; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v8i16_none:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C1(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: mtvsrwz v4, r5
+; CHECK-AIX-32-P9-NEXT: vinsertb v2, v4, 0
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %load0.tmp = load <2 x i8>, ptr %a0
+ %load0.tmp1 = bitcast <2 x i8> %load0.tmp to i16
+ %load0 = insertelement <8 x i16> %b, i16 %load0.tmp1, i64 0
+ %load1.tmp = insertelement <16 x i8> %a, i8 %arg, i32 0
+ %load1 = bitcast <16 x i8> %load1.tmp to <8 x i16>
+ %shuff = shufflevector <8 x i16> %load0, <8 x i16> %load1, <8 x i32> 
+ store <8 x i16> %shuff, ptr undef
+ ret void
+}
+
+define void @test_none_v4i32(ptr %ptr, ptr %ptr2, i8 %v3) local_unnamed_addr #0 {
+; CHECK-LE-P8-LABEL: test_none_v4i32:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-LE-P8-NEXT: mtvsrd v3, r5
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_1@toc@ha
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_1@toc@l
+; CHECK-LE-P8-NEXT: xxswapd v2, vs0
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: vperm v2, v3, v3, v2
+; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r3
+; CHECK-LE-P8-NEXT: xxswapd v4, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stfdx f0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_none_v4i32:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha
+; CHECK-LE-P9-NEXT: mtvsrd v3, r5
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_1@toc@ha
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_1@toc@l
+; CHECK-LE-P9-NEXT: vperm v3, v3, v3, v4
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-LE-P9-NEXT: xxswapd vs0, v2
+; CHECK-LE-P9-NEXT: stfd f0, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_none_v4i32:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-BE-P8-NEXT: mtvsrwz v3, r5
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r4
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI2_1@toc@ha
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI2_1@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r4
+; CHECK-BE-P8-NEXT: vperm v2, v3, v3, v2
+; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3
+; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-BE-P8-NEXT: stxsdx v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_none_v4i32:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha
+; CHECK-BE-P9-NEXT: mtvsrwz v3, r5
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_1@toc@ha
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_1@toc@l
+; CHECK-BE-P9-NEXT: vperm v3, v3, v3, v4
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-BE-P9-NEXT: stxsd v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_none_v4i32:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C3(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r5
+; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r4
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C4(r2) # %const.1
+; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v3, v2
+; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3
+; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4
+; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-64-P8-NEXT: stxsdx v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_none_v4i32:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r5
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.1
+; CHECK-AIX-64-P9-NEXT: vperm v3, v3, v3, v4
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-64-P9-NEXT: stxsd v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_none_v4i32:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: stb r5, -32(r1)
+; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: vmrghh v3, v3, v3
+; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: lwz r3, -12(r1)
+; CHECK-AIX-32-P8-NEXT: stw r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: lwz r3, -16(r1)
+; CHECK-AIX-32-P8-NEXT: stw r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_none_v4i32:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: stb r5, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vmrghh v3, v3, v3
+; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, -12(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i16>, ptr %ptr, align 4
+ %tmp = insertelement <4 x i8> undef, i8 %v3, i32 0
+ %tmp0 = bitcast <4 x i8> %tmp to <2 x i16>
+ %1 = shufflevector <2 x i16> %0, <2 x i16> %tmp0, <4 x i32> 
+ store <4 x i16> %1, ptr undef, align 4
+ ret void
+}
+
+define void @test_v4i32_none(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) {
+; CHECK-LE-P8-LABEL: test_v4i32_none:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-LE-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI3_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v4i32_none:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha
+; CHECK-LE-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v3, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v4i32_none:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-BE-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-BE-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI3_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v4i32_none:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha
+; CHECK-BE-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v3, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v4i32_none:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C5(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v4i32_none:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C4(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-64-P9-NEXT: lxv v3, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v4i32_none:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, L..C4(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v4i32_none:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C3(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-32-P9-NEXT: lxv v3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <2 x i16>, ptr %ptr1, align 1
+ %1 = load <2 x i16>, ptr %ptr2, align 1
+ %shuffle1 = shufflevector <2 x i16> %0, <2 x i16> %1, <4 x i32> 
+ %2 = zext <4 x i16> %shuffle1 to <4 x i32>
+ store <4 x i32> %2, ptr undef, align 16
+ ret void
+}
+
+define void @test_none_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) {
+; CHECK-LE-P8-LABEL: test_none_v2i64:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI4_1@toc@ha
+; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l
+; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI4_1@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r5
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: xxswapd v4, vs1
+; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_none_v2i64:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha
+; CHECK-LE-P9-NEXT: lxv v3, 0(r4)
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI4_1@toc@ha
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI4_1@toc@l
+; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4
+; CHECK-LE-P9-NEXT: lxv v3, 0(r3)
+; CHECK-LE-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_none_v2i64:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha
+; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-BE-P8-NEXT: xxlxor v3, v3, v3
+; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_none_v2i64:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha
+; CHECK-BE-P9-NEXT: lxv v3, 0(r4)
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-BE-P9-NEXT: xxlxor v3, v3, v3
+; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_none_v2i64:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r5, L..C6(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-64-P8-NEXT: xxlxor v3, v3, v3
+; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_none_v2i64:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: lxv v3, 0(r4)
+; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3
+; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_none_v2i64:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r5, L..C5(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5
+; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-32-P8-NEXT: xxlxor v3, v3, v3
+; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_none_v2i64:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C4(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: lxv v3, 0(r4)
+; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3)
+; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-AIX-32-P9-NEXT: xxlxor v3, v3, v3
+; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <4 x i16>, ptr %ptr1, align 1
+ %1 = load <4 x i32>, ptr %ptr2, align 1
+ %bc = trunc <4 x i32> %1 to <4 x i16>
+ %shuffle1 = shufflevector <4 x i16> %0, <4 x i16> %bc, <4 x i32> 
+ %2 = zext <4 x i16> %shuffle1 to <4 x i32>
+ store <4 x i32> %2, ptr undef, align 16
+ ret void
+}
+
+define void @test_v2i64_none(ptr nocapture readonly %ptr1) {
+; CHECK-LE-P8-LABEL: test_v2i64_none:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI5_0@toc@ha
+; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI5_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT: xxswapd vs0, v2
+; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v2i64_none:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI5_0@toc@ha
+; CHECK-LE-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI5_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v3, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v2i64_none:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI5_0@toc@ha
+; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-BE-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI5_0@toc@l
+; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-BE-P8-NEXT: blr
+;
+; CHECK-BE-P9-LABEL: test_v2i64_none:
+; CHECK-BE-P9: # %bb.0: # %entry
+; CHECK-BE-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI5_0@toc@ha
+; CHECK-BE-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI5_0@toc@l
+; CHECK-BE-P9-NEXT: lxv v3, 0(r3)
+; CHECK-BE-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-BE-P9-NEXT: stxv v2, 0(r3)
+; CHECK-BE-P9-NEXT: blr
+;
+; CHECK-AIX-64-P8-LABEL: test_v2i64_none:
+; CHECK-AIX-64-P8: # %bb.0: # %entry
+; CHECK-AIX-64-P8-NEXT: ld r4, L..C7(r2) # %const.0
+; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4
+; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-64-P8-NEXT: blr
+;
+; CHECK-AIX-64-P9-LABEL: test_v2i64_none:
+; CHECK-AIX-64-P9: # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: ld r3, L..C6(r2) # %const.0
+; CHECK-AIX-64-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-64-P9-NEXT: lxv v3, 0(r3)
+; CHECK-AIX-64-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-64-P9-NEXT: blr
+;
+; CHECK-AIX-32-P8-LABEL: test_v2i64_none:
+; CHECK-AIX-32-P8: # %bb.0: # %entry
+; CHECK-AIX-32-P8-NEXT: lwz r4, 4(r3)
+; CHECK-AIX-32-P8-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32
+; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3
+; CHECK-AIX-32-P8-NEXT: lwz r3, L..C6(r2) # %const.0
+; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4
+; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3
+; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0
+; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3
+; CHECK-AIX-32-P8-NEXT: blr
+;
+; CHECK-AIX-32-P9-LABEL: test_v2i64_none:
+; CHECK-AIX-32-P9: # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT: lwz r4, 4(r3)
+; CHECK-AIX-32-P9-NEXT: xxlxor v4, v4, v4
+; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1)
+; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lwz r3, L..C5(r2) # %const.0
+; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1)
+; CHECK-AIX-32-P9-NEXT: lxv v3, 0(r3)
+; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0
+; CHECK-AIX-32-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3)
+; CHECK-AIX-32-P9-NEXT: blr
+entry:
+ %0 = load <4 x i16>, ptr %ptr1, align 1
+ %shuffle1 = shufflevector <4 x i16> %0, <4 x i16> undef, <4 x i32> 
+ %1 = zext <4 x i16> %shuffle1 to <4 x i32>
+ store <4 x i32> %1, ptr undef, align 16
+ ret void
+}
+
+define <16 x i8> @test_v8i16_v8i16(ptr %a, ptr %b) {
+; CHECK-LE-P8-LABEL: test_v8i16_v8i16:
+; CHECK-LE-P8: # %bb.0: # %entry
+; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI6_0@toc@ha
+; CHECK-LE-P8-NEXT: lhz r3, 0(r3)
+; CHECK-LE-P8-NEXT: lhz r4, 0(r4)
+; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI6_0@toc@l
+; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5
+; CHECK-LE-P8-NEXT: mtvsrd v2, r3
+; CHECK-LE-P8-NEXT: mtvsrd v4, r4
+; CHECK-LE-P8-NEXT: xxswapd v3, vs0
+; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-LE-P8-NEXT: blr
+;
+; CHECK-LE-P9-LABEL: test_v8i16_v8i16:
+; CHECK-LE-P9: # %bb.0: # %entry
+; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3
+; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI6_0@toc@ha
+; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r4
+; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI6_0@toc@l
+; CHECK-LE-P9-NEXT: lxv v4, 0(r3)
+; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4
+; CHECK-LE-P9-NEXT: blr
+;
+; CHECK-BE-P8-LABEL: test_v8i16_v8i16:
+; CHECK-BE-P8: # %bb.0: # %entry
+; CHECK-BE-P8-NEXT: addis r5, r2,
.LCPI6_0@toc@ha +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: lhz r4, 0(r4) +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI6_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 +; CHECK-BE-P8-NEXT: vperm v2, v3, v4, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI6_0@toc@ha +; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI6_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C8(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C7(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C7(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r5 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C6(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %load1 = load <2 x i8>, ptr %a + %load2 = load <2 x i8>, ptr %b + %shuffle1 = shufflevector <2 x i8> %load1, <2 x i8> %load2, <8 x i32> + %shuffle2 = shufflevector <8 x i8> %shuffle1, <8 x i8> %shuffle1, <16 x i32> + ret <16 x i8> %shuffle2 +} + +define <16 x i8> @test_v8i16_v4i32(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v8i16_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, f0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, 
r3 +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-BE-P9-NEXT: xxsldwi v3, f0, f0, 1 +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-64-P9-NEXT: xxsldwi v3, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 4 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v8i16_v2i64(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v8i16_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, f0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v8i16_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lfd f0, 0(r4) +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v8i16_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: lxsdx v2, 0, r4 +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v8i16_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; 
CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define void @test_v4i32_v4i32(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) { +; CHECK-LE-P8-LABEL: test_v4i32_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI9_0@toc@ha +; CHECK-LE-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI9_1@toc@ha +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI9_0@toc@l +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI9_1@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI9_0@toc@ha +; CHECK-LE-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI9_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI9_1@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI9_1@toc@l +; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P9-NEXT: lxv v3, 0(r3) +; CHECK-LE-P9-NEXT: xxlxor v4, v4, v4 +; CHECK-LE-P9-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P9-NEXT: stxv v2, 0(r3) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI9_0@toc@ha +; CHECK-BE-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI9_0@toc@l +; 
CHECK-BE-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI9_0@toc@ha +; CHECK-BE-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI9_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: stxv v2, 0(r3) +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C9(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C8(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C8(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C7(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i16>, ptr %ptr1, align 1 + %1 = load <2 x i16>, ptr %ptr2, align 1 + %shuffle1 = shufflevector <2 x i16> %0, <2 x i16> %1, <4 x i32> + %2 = zext <4 x i16> %shuffle1 to <4 x i32> + store <4 x i32> %2, ptr undef, align 16 + ret void +} + +define <16 x i8> @test_v4i32_v8i16(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v4i32_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, f0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v8i16: +; 
CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v4i32_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-BE-P9-NEXT: xxsldwi v3, f0, f0, 1 +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-64-P9-NEXT: xxsldwi v3, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 4 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %3, <16 x i8> %2, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v4i32_v2i64(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v4i32_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, f0 +; CHECK-LE-P8-NEXT: xxswapd v3, f1 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v4i32_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, f0 +; CHECK-LE-P9-NEXT: lfd f0, 0(r4) +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v4i32_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: 
test_v4i32_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C9(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C8(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a, align 4 + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} + +define void @test_v2i64_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) { +; CHECK-LE-P8-LABEL: test_v2i64_v2i64: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI12_0@toc@ha +; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI12_1@toc@ha +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI12_0@toc@l +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI12_1@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4 +; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v2i64: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI12_0@toc@ha +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI12_0@toc@l +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI12_1@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI12_1@toc@l +; CHECK-LE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P9-NEXT: lxv v3, 0(r3) +; CHECK-LE-P9-NEXT: xxlxor v4, v4, v4 +; CHECK-LE-P9-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P9-NEXT: stxv v2, 0(r3) +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v2i64: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI12_0@toc@ha +; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3 
+; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI12_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v2i64: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsd v2, 0(r3) +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI12_0@toc@ha +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI12_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: stxv v2, 0(r3) +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C10(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: ld r3, L..C9(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C10(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C9(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <4 x i16>, ptr %ptr1, align 1 + %1 = load <4 x i16>, ptr %ptr2, align 1 + %shuffle1 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> + %2 = zext <4 x i16> %shuffle1 to <4 x i32> + store <4 x i32> %2, ptr undef, align 16 + ret void +} + +define <16 x i8> @test_v2i64_v4i32(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v2i64_v4i32: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 +; CHECK-LE-P8-NEXT: xxswapd v2, f0 +; CHECK-LE-P8-NEXT: xxswapd v3, f1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v4i32: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P9-NEXT: xxswapd v2, f0 +; CHECK-LE-P9-NEXT: lfd f0, 0(r4) +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; 
CHECK-BE-P8-LABEL: test_v2i64_v4i32: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v4i32: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C11(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C10(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a, align 4 + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %3, <16 x i8> %2, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @test_v2i64_v8i16(ptr %a, ptr %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_v2i64_v8i16: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, f0 +; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_v2i64_v8i16: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lfd f0, 0(r4) +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_v2i64_v8i16: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: lxsdx v2, 0, r4 +; CHECK-BE-P8-NEXT: sldi r3, r3, 48 +; CHECK-BE-P8-NEXT: mtvsrd v3, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_v2i64_v8i16: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; 
CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 +; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <2 x i8>, ptr %a + %bc1 = bitcast <2 x i8> %0 to i16 + %vecinit3 = insertelement <8 x i16> poison, i16 %bc1, i64 0 + %1 = load <2 x i8>, ptr %b, align 8 + %bc2 = bitcast <2 x i8> %1 to i16 + %vecinit6 = insertelement <8 x i16> undef, i16 %bc2, i64 0 + %2 = bitcast <8 x i16> %vecinit3 to <16 x i8> + %3 = bitcast <8 x i16> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %3, <16 x i8> %2, <16 x i32> + ret <16 x i8> %shuffle +} diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll --- a/llvm/test/CodeGen/RISCV/iabs.ll +++ b/llvm/test/CodeGen/RISCV/iabs.ll @@ -727,9 +727,9 @@ ; ; RV64ZBB-LABEL: zext_abs32: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: sext.w a0, a0 -; RV64ZBB-NEXT: negw a1, a0 -; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sext.w a1, a0 +; RV64ZBB-NEXT: negw a0, a0 +; RV64ZBB-NEXT: max a0, a1, a0 ; RV64ZBB-NEXT: ret ; ; RV64ZBT-LABEL: zext_abs32: diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -9,8 +9,7 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-LABEL: ctlz_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a1, a0 -; RV64I-NEXT: beqz a1, .LBB0_2 +; RV64I-NEXT: beqz a0, .LBB0_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill @@ -63,8 +62,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-LABEL: log2_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a1, a0 -; RV64I-NEXT: beqz a1, .LBB1_2 +; RV64I-NEXT: beqz a0, .LBB1_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill @@ -368,34 +366,34 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ; RV64I-LABEL: cttz_i32: ; RV64I: # %bb.0: +; RV64I-NEXT: beqz a0, .LBB6_4 +; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: sext.w s0, a0 -; RV64I-NEXT: beqz s0, .LBB6_3 -; RV64I-NEXT: # %bb.1: # %cond.false 
-; RV64I-NEXT: neg a1, a0 -; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: and a0, s0, a0 ; RV64I-NEXT: lui a1, 30667 ; RV64I-NEXT: addiw a1, a1, 1329 ; RV64I-NEXT: call __muldi3@plt ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: beqz s0, .LBB6_4 +; RV64I-NEXT: beqz s0, .LBB6_3 ; RV64I-NEXT: # %bb.2: # %cond.false ; RV64I-NEXT: srliw a0, a1, 27 ; RV64I-NEXT: lui a1, %hi(.LCPI6_0) ; RV64I-NEXT: addi a1, a1, %lo(.LCPI6_0) ; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: lbu a0, 0(a0) -; RV64I-NEXT: j .LBB6_4 -; RV64I-NEXT: .LBB6_3: -; RV64I-NEXT: li a0, 32 -; RV64I-NEXT: .LBB6_4: # %cond.end +; RV64I-NEXT: .LBB6_3: # %cond.false ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret +; RV64I-NEXT: .LBB6_4: +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: cttz_i32: ; RV64ZBB: # %bb.0: @@ -928,7 +926,7 @@ define signext i32 @abs_i32_sext(i32 signext %x) { ; RV64I-LABEL: abs_i32_sext: ; RV64I: # %bb.0: -; RV64I-NEXT: sraiw a1, a0, 31 +; RV64I-NEXT: srai a1, a0, 31 ; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -1725,17 +1725,10 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi sp, sp, -16 -; RV64ZVE32F-NEXT: .cfi_def_cfa_offset 16 -; RV64ZVE32F-NEXT: sw a1, 12(sp) -; RV64ZVE32F-NEXT: sw a0, 8(sp) -; RV64ZVE32F-NEXT: addi a0, sp, 12 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; RV64ZVE32F-NEXT: vle32.v v9, (a0) -; RV64ZVE32F-NEXT: addi a0, sp, 8 -; RV64ZVE32F-NEXT: vle32.v v8, (a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, mu -; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; RV64ZVE32F-NEXT: vmv.v.x v8, a1 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, mu +; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, mu ; RV64ZVE32F-NEXT: vmv.x.s a0, v0 ; RV64ZVE32F-NEXT: andi a1, a0, 1 @@ -1744,7 +1737,6 @@ ; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB24_4 ; RV64ZVE32F-NEXT: .LBB24_2: # %else2 -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB24_3: # %cond.store ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu @@ -1755,7 +1747,6 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v8, (a3) -; RV64ZVE32F-NEXT: addi sp, sp, 16 ; RV64ZVE32F-NEXT: ret %tval = trunc <2 x i64> %val to <2 x i32> call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %tval, <2 x i32*> %ptrs, i32 4, <2 x i1> %m) diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll @@ -0,0 +1,295 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen2/i64/g' %s | llc -mtriple=riscv32 -mattr=+m | \ +; RUN: FileCheck %s --check-prefix=RV32 +; RUN: sed 's/iXLen2/i128/g' %s | llc -mtriple=riscv64 -mattr=+m | \ +; RUN: FileCheck %s --check-prefix=RV64 + +define iXLen2 @test_udiv_3(iXLen2 %x) nounwind { +; 
RV32-LABEL: test_udiv_3: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 3 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_3: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 3 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 3 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_5(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_5: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 5 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_5: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 5 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 5 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_7(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_7: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 7 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_7: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 7 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 7 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_9(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_9: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 9 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_9: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 9 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 9 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_15(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_15: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 15 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_15: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 15 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 15 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_17(iXLen2 %x) nounwind { +; RV32-LABEL: 
test_udiv_17: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 17 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_17: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 17 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 17 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_255(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_255: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 255 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_255: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 255 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_257(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_257: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 257 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_257: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 257 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 257 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_65535(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_65535: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_65535: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 65535 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_65537: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, 1 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_65537: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, 1 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded 
Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 65537 + ret iXLen2 %a +} + +define iXLen2 @test_udiv_12(iXLen2 %x) nounwind { +; RV32-LABEL: test_udiv_12: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 12 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __udivdi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_udiv_12: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 12 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __udivti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = udiv iXLen2 %x, 12 + ret iXLen2 %a +} diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll @@ -0,0 +1,296 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen2/i64/g' %s | llc -mtriple=riscv32 -mattr=+m | \ +; RUN: FileCheck %s --check-prefix=RV32 +; RUN: sed 's/iXLen2/i128/g' %s | llc -mtriple=riscv64 -mattr=+m | \ +; RUN: FileCheck %s --check-prefix=RV64 + +define iXLen2 @test_urem_3(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_3: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 3 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_3: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 3 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 3 + ret iXLen2 %a +} + +define iXLen2 @test_urem_5(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_5: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 5 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_5: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 5 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 5 + ret iXLen2 %a +} + +define iXLen2 @test_urem_7(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_7: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 7 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_7: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 7 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 7 + ret iXLen2 %a +} + +define iXLen2 @test_urem_9(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_9: +; RV32: # 
%bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 9 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_9: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 9 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 9 + ret iXLen2 %a +} + +define iXLen2 @test_urem_15(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_15: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 15 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_15: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 15 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 15 + ret iXLen2 %a +} + +define iXLen2 @test_urem_17(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_17: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 17 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_17: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 17 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 17 + ret iXLen2 %a +} + +define iXLen2 @test_urem_255(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_255: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 255 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_255: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 255 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 255 + ret iXLen2 %a +} + +define iXLen2 @test_urem_257(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_257: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 257 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_257: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 257 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 257 + ret iXLen2 %a +} + +define iXLen2 @test_urem_65535(iXLen2 %x) nounwind { +; RV32-LABEL: 
test_urem_65535: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_65535: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 65535 + ret iXLen2 %a +} + +define iXLen2 @test_urem_65537(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_65537: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, 1 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_65537: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, 1 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 65537 + ret iXLen2 %a +} + +define iXLen2 @test_urem_12(iXLen2 %x) nounwind { +; RV32-LABEL: test_urem_12: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 12 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __umoddi3@plt +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: test_urem_12: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: li a2, 12 +; RV64-NEXT: li a3, 0 +; RV64-NEXT: call __umodti3@plt +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = urem iXLen2 %x, 12 + ret iXLen2 %a +} + diff --git a/llvm/test/CodeGen/RISCV/trunc-free.ll b/llvm/test/CodeGen/RISCV/trunc-free.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/trunc-free.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv64 | FileCheck %s + +; Make sure we use lwu for the load, and don't emit +; a sext.w for the compare. This requires isTruncateFree +; to return true for i64->i32. Otherwise we emit a +; lw and a shift pair for the zext. 
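+; (Illustrative, hypothetical sketch, not autogenerated: the "lw and a shift pair" fallback would look roughly like "lw a0, 0(a0)" followed by "slli a0, a0, 32" and "srli a0, a0, 32" to clear the upper bits.)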
+ +define void @foo(i32* %p, i64* %q, i32* %r) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: lwu a0, 0(a0) +; CHECK-NEXT: sd a0, 0(a1) +; CHECK-NEXT: beqz a0, .LBB0_2 +; CHECK-NEXT: # %bb.1: # %if +; CHECK-NEXT: sw a0, 0(a2) +; CHECK-NEXT: .LBB0_2: # %end +; CHECK-NEXT: ret + %a = load i32, i32* %p + %b = zext i32 %a to i64 + store i64 %b, i64* %q + %c = icmp ne i32 %a, 0 + br i1 %c, label %if, label %end + +if: + store i32 %a, i32* %r + br label %end + +end: + ret void +} diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll --- a/llvm/test/CodeGen/X86/avx-insertelt.ll +++ b/llvm/test/CodeGen/X86/avx-insertelt.ll @@ -422,7 +422,7 @@ define <4 x i64> @insert_i64_two_elts_of_high_subvector(<4 x i64> %x, i64 %s) { ; AVX-LABEL: insert_i64_two_elts_of_high_subvector: ; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1 +; AVX-NEXT: vmovq %rdi, %xmm1 ; AVX-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -523,7 +523,7 @@ define <4 x i64> @insert_i64_two_elts_of_low_subvector(<4 x i64> %x, i64 %s) { ; AVX-LABEL: insert_i64_two_elts_of_low_subvector: ; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1 +; AVX-NEXT: vmovq %rdi, %xmm1 ; AVX-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -80,13 +80,13 @@ define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) { ; SSE-LABEL: combine_vec_mul_pow2c: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psllq $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psllq $4, %xmm2 ; SSE-NEXT: psllq $2, %xmm1 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: paddq %xmm0, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_mul_pow2c: diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -456,3 +456,481 @@ %6 = insertvalue { i64, i32 } %5, i32 %4, 1 ret { i64, i32 } %6 } + +define i64 @urem_i64_3(i64 %x) nounwind { +; X32-LABEL: urem_i64_3: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $3 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_3: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq %rdx +; X64-NEXT: leaq (%rdx,%rdx,2), %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 3 + ret i64 %rem +} + +define i64 @urem_i64_5(i64 %x) nounwind { +; X32-LABEL: urem_i64_5: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $5 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_5: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-3689348814741910323, %rcx # 
imm = 0xCCCCCCCCCCCCCCCD +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq $2, %rdx +; X64-NEXT: leaq (%rdx,%rdx,4), %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 5 + ret i64 %rem +} + +define i64 @urem_i64_15(i64 %x) nounwind { +; X32-LABEL: urem_i64_15: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $15 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_15: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq $3, %rdx +; X64-NEXT: leaq (%rdx,%rdx,4), %rax +; X64-NEXT: leaq (%rax,%rax,2), %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 15 + ret i64 %rem +} + +define i64 @urem_i64_17(i64 %x) nounwind { +; X32-LABEL: urem_i64_17: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $17 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_17: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: andq $-16, %rax +; X64-NEXT: shrq $4, %rdx +; X64-NEXT: addq %rax, %rdx +; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 17 + ret i64 %rem +} + +define i64 @urem_i64_255(i64 %x) nounwind { +; X32-LABEL: urem_i64_255: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $255 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_255: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq $7, %rdx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shlq $8, %rax +; X64-NEXT: subq %rax, %rdx +; X64-NEXT: leaq (%rdx,%rdi), %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 255 + ret i64 %rem +} + +define i64 @urem_i64_257(i64 %x) nounwind { +; X32-LABEL: urem_i64_257: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $257 # imm = 0x101 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_257: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: andq $-256, %rax +; X64-NEXT: shrq $8, %rdx +; X64-NEXT: addq %rax, %rdx +; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 257 + ret i64 %rem +} + +define i64 @urem_i64_65535(i64 %x) nounwind { +; X32-LABEL: urem_i64_65535: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $65535 # imm = 0xFFFF +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, 
%esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_65535: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq $15, %rdx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shlq $16, %rax +; X64-NEXT: subq %rax, %rdx +; X64-NEXT: leaq (%rdx,%rdi), %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 65535 + ret i64 %rem +} + +define i64 @urem_i64_65537(i64 %x) nounwind { +; X32-LABEL: urem_i64_65537: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $65537 # imm = 0x10001 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_65537: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000 +; X64-NEXT: shrq $16, %rdx +; X64-NEXT: addq %rax, %rdx +; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 65537 + ret i64 %rem +} + +define i64 @urem_i64_12(i64 %x) nounwind { +; X32-LABEL: urem_i64_12: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $12 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __umoddi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: urem_i64_12: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq %rdx +; X64-NEXT: andq $-4, %rdx +; X64-NEXT: leaq (%rdx,%rdx,2), %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq +entry: + %rem = urem i64 %x, 12 + ret i64 %rem +} + +define i64 @udiv_i64_3(i64 %x) nounwind { +; X32-LABEL: udiv_i64_3: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $3 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_3: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 3 + ret i64 %rem +} + +define i64 @udiv_i64_5(i64 %x) nounwind { +; X32-LABEL: udiv_i64_5: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $5 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_5: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-3689348814741910323, %rcx # imm = 0xCCCCCCCCCCCCCCCD +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $2, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 5 + ret i64 %rem +} + +define i64 @udiv_i64_15(i64 %x) nounwind { +; X32-LABEL: udiv_i64_15: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $15 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_15: +; X64: # %bb.0: # 
%entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $3, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 15 + ret i64 %rem +} + +define i64 @udiv_i64_17(i64 %x) nounwind { +; X32-LABEL: udiv_i64_17: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $17 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_17: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $4, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 17 + ret i64 %rem +} + +define i64 @udiv_i64_255(i64 %x) nounwind { +; X32-LABEL: udiv_i64_255: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $255 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_255: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $7, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 255 + ret i64 %rem +} + +define i64 @udiv_i64_257(i64 %x) nounwind { +; X32-LABEL: udiv_i64_257: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $257 # imm = 0x101 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_257: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $8, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 257 + ret i64 %rem +} + +define i64 @udiv_i64_65535(i64 %x) nounwind { +; X32-LABEL: udiv_i64_65535: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $65535 # imm = 0xFFFF +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_65535: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $15, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 65535 + ret i64 %rem +} + +define i64 @udiv_i64_65537(i64 %x) nounwind { +; X32-LABEL: udiv_i64_65537: +; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $65537 # imm = 0x10001 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_65537: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $16, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 65537 + ret i64 %rem +} + +define i64 @udiv_i64_12(i64 %x) nounwind { +; X32-LABEL: udiv_i64_12: 
+; X32: # %bb.0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: pushl $0 +; X32-NEXT: pushl $12 +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: pushl {{[0-9]+}}(%esp) +; X32-NEXT: calll __udivdi3 +; X32-NEXT: addl $28, %esp +; X32-NEXT: retl +; +; X64-LABEL: udiv_i64_12: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: shrq $3, %rax +; X64-NEXT: retq +entry: + %rem = udiv i64 %x, 12 + ret i64 %rem +} diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -123,3 +123,543 @@ %2 = trunc i128 %1 to i64 ret i64 %2 } + +define i128 @urem_i128_3(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_3: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $3, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_3: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 3 + ret i128 %rem +} + +define i128 @urem_i128_5(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_5: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $5, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_5: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $5, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 5 + ret i128 %rem +} + +define i128 @urem_i128_15(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_15: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $15, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_15: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $15, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 15 + ret i128 %rem +} + +define i128 @urem_i128_17(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_17: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; 
X86-64-NEXT: movl $17, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_17: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $17, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 17 + ret i128 %rem +} + +define i128 @urem_i128_255(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_255: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $255, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_255: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $255, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 255 + ret i128 %rem +} + +define i128 @urem_i128_257(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_257: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $257, %edx # imm = 0x101 +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_257: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $257, {{[0-9]+}}(%rsp) # imm = 0x101 +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 257 + ret i128 %rem +} + +define i128 @urem_i128_65535(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_65535: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_65535: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $65535, {{[0-9]+}}(%rsp) # imm = 0xFFFF +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 65535 + ret i128 %rem +} + +define i128 @urem_i128_65537(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_65537: +; 
X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $65537, %edx # imm = 0x10001 +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_65537: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $65537, {{[0-9]+}}(%rsp) # imm = 0x10001 +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 65537 + ret i128 %rem +} + +define i128 @urem_i128_12(i128 %x) nounwind { +; X86-64-LABEL: urem_i128_12: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $12, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __umodti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: urem_i128_12: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $12, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __umodti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = urem i128 %x, 12 + ret i128 %rem +} + +define i128 @udiv_i128_3(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_3: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $3, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_3: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 3 + ret i128 %rem +} + +define i128 @udiv_i128_5(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_5: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $5, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_5: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $5, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 5 + ret i128 %rem +} + +define i128 @udiv_i128_15(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_15: +; X86-64: # 
%bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $15, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_15: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $15, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 15 + ret i128 %rem +} + +define i128 @udiv_i128_17(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_17: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $17, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_17: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $17, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 17 + ret i128 %rem +} + +define i128 @udiv_i128_255(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_255: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $255, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_255: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $255, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 255 + ret i128 %rem +} + +define i128 @udiv_i128_257(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_257: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $257, %edx # imm = 0x101 +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_257: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $257, {{[0-9]+}}(%rsp) # imm = 0x101 +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 257 + ret i128 %rem +} + +define i128 @udiv_i128_65535(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_65535: +; 
X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_65535: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $65535, {{[0-9]+}}(%rsp) # imm = 0xFFFF +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 65535 + ret i128 %rem +} + +define i128 @udiv_i128_65537(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_65537: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $65537, %edx # imm = 0x10001 +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_65537: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $65537, {{[0-9]+}}(%rsp) # imm = 0x10001 +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 65537 + ret i128 %rem +} + +define i128 @udiv_i128_12(i128 %x) nounwind { +; X86-64-LABEL: udiv_i128_12: +; X86-64: # %bb.0: # %entry +; X86-64-NEXT: pushq %rax +; X86-64-NEXT: movl $12, %edx +; X86-64-NEXT: xorl %ecx, %ecx +; X86-64-NEXT: callq __udivti3@PLT +; X86-64-NEXT: popq %rcx +; X86-64-NEXT: retq +; +; WIN64-LABEL: udiv_i128_12: +; WIN64: # %bb.0: # %entry +; WIN64-NEXT: subq $72, %rsp +; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $12, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; WIN64-NEXT: callq __udivti3 +; WIN64-NEXT: movq %xmm0, %rax +; WIN64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; WIN64-NEXT: movq %xmm0, %rdx +; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: retq +entry: + %rem = udiv i128 %x, 12 + ret i128 %rem +} diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -398,7 +398,7 @@ define <2 x i64> @freeze_shl_vec_outofrange(<2 x i64> %a0) nounwind { ; X86-LABEL: freeze_shl_vec_outofrange: ; X86: # %bb.0: -; X86-NEXT: psllq $1, %xmm0 +; X86-NEXT: paddq %xmm0, %xmm0 ; X86-NEXT: psllq $2, %xmm0 ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/gcc_except_table_bb_sections_nolpads.ll b/llvm/test/CodeGen/X86/gcc_except_table_bb_sections_nolpads.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/gcc_except_table_bb_sections_nolpads.ll @@ -0,0 +1,44 @@ +;; Verify that @LPStart is omitted when there are no landing pads. This test +;; uses an unknown personality to force emitting the exception table.
+ +; RUN: llc -basic-block-sections=all -mtriple=x86_64 < %s | FileCheck %s + +declare void @throwit() +declare i32 @__unknown_ehpersonality(...) + +define void @foo(i1 %cond) uwtable personality ptr @__unknown_ehpersonality { +entry: + br i1 %cond, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + call void @throwit() + unreachable + +cond.false: ; preds = %entry + ret void +} + +; CHECK: GCC_except_table0: +; CHECK-NEXT: .Lexception0: +; CHECK-NEXT: .byte 255 # @LPStart Encoding = omit +; CHECK-NEXT: .byte 255 # @TType Encoding = omit +; CHECK-NEXT: .byte 1 # Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Laction_table_base0-.Lcst_begin0 +; CHECK-NEXT: .Lcst_begin0: +; CHECK-NEXT: .Lexception1: +; CHECK-NEXT: .byte 255 # @LPStart Encoding = omit +; CHECK-NEXT: .byte 255 # @TType Encoding = omit +; CHECK-NEXT: .byte 1 # Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Laction_table_base0-.Lcst_begin1 +; CHECK-NEXT: .Lcst_begin1: +; CHECK-NEXT: .Lexception2: +; CHECK-NEXT: .byte 255 # @LPStart Encoding = omit +; CHECK-NEXT: .byte 255 # @TType Encoding = omit +; CHECK-NEXT: .byte 1 # Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Laction_table_base0-.Lcst_begin2 +; CHECK-NEXT: .Lcst_begin2: +; CHECK-NEXT: .uleb128 foo.__part.2-foo.__part.2 # >> Call Site 1 << +; CHECK-NEXT: .uleb128 .LBB_END0_2-foo.__part.2 # Call between foo.__part.2 and .LBB_END0_2 +; CHECK-NEXT: .byte 0 # has no landing pad +; CHECK-NEXT: .byte 0 # On action: cleanup +; CHECK-NEXT: .Laction_table_base0: diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -157,71 +157,71 @@ ; SSE2-LABEL: PR42833: ; SSE2: # %bb.0: ; SSE2-NEXT: movl b(%rip), %eax -; SSE2-NEXT: movdqa c+144(%rip), %xmm0 -; SSE2-NEXT: movdqa c+128(%rip), %xmm1 +; SSE2-NEXT: movdqa c+128(%rip), %xmm0 +; SSE2-NEXT: movdqa c+144(%rip), %xmm1 ; SSE2-NEXT: addl c+128(%rip), %eax ; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: movdqa d+144(%rip), %xmm4 -; SSE2-NEXT: psubd %xmm0, %xmm4 -; SSE2-NEXT: paddd %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: psubd %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] -; SSE2-NEXT: movdqa %xmm0, c+144(%rip) +; SSE2-NEXT: movdqa %xmm1, c+144(%rip) ; SSE2-NEXT: movaps %xmm5, c+128(%rip) -; SSE2-NEXT: movdqa c+160(%rip), %xmm0 +; SSE2-NEXT: movdqa c+160(%rip), %xmm1 ; SSE2-NEXT: movdqa c+176(%rip), %xmm3 ; SSE2-NEXT: movdqa d+160(%rip), %xmm5 ; SSE2-NEXT: movdqa d+176(%rip), %xmm6 ; SSE2-NEXT: movdqa d+128(%rip), %xmm7 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE2-NEXT: psubd %xmm1, %xmm7 +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE2-NEXT: psubd %xmm0, %xmm7 ; SSE2-NEXT: psubd %xmm3, %xmm6 -; SSE2-NEXT: psubd %xmm0, %xmm5 +; SSE2-NEXT: psubd %xmm1, %xmm5 ; SSE2-NEXT: movdqa %xmm5, d+160(%rip) ; SSE2-NEXT: movdqa %xmm6, d+176(%rip) ; SSE2-NEXT: movdqa %xmm4, d+144(%rip) ; SSE2-NEXT: movdqa %xmm7, d+128(%rip) ; SSE2-NEXT: paddd %xmm3, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, c+160(%rip) +; SSE2-NEXT: paddd %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, c+160(%rip) ; SSE2-NEXT: movdqa %xmm3, c+176(%rip) ; SSE2-NEXT: retq ; ; SSE42-LABEL: PR42833: ; SSE42: # 
%bb.0: ; SSE42-NEXT: movl b(%rip), %eax -; SSE42-NEXT: movdqa c+144(%rip), %xmm0 -; SSE42-NEXT: movdqa c+128(%rip), %xmm1 +; SSE42-NEXT: movdqa c+128(%rip), %xmm0 +; SSE42-NEXT: movdqa c+144(%rip), %xmm1 ; SSE42-NEXT: addl c+128(%rip), %eax ; SSE42-NEXT: movd %eax, %xmm2 -; SSE42-NEXT: paddd %xmm1, %xmm2 +; SSE42-NEXT: paddd %xmm0, %xmm2 ; SSE42-NEXT: movdqa d+144(%rip), %xmm3 -; SSE42-NEXT: psubd %xmm0, %xmm3 -; SSE42-NEXT: paddd %xmm0, %xmm0 -; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: paddd %xmm1, %xmm4 +; SSE42-NEXT: psubd %xmm1, %xmm3 +; SSE42-NEXT: paddd %xmm1, %xmm1 +; SSE42-NEXT: movdqa %xmm0, %xmm4 +; SSE42-NEXT: paddd %xmm0, %xmm4 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] -; SSE42-NEXT: movdqa %xmm0, c+144(%rip) +; SSE42-NEXT: movdqa %xmm1, c+144(%rip) ; SSE42-NEXT: movdqa %xmm4, c+128(%rip) -; SSE42-NEXT: movdqa c+160(%rip), %xmm0 +; SSE42-NEXT: movdqa c+160(%rip), %xmm1 ; SSE42-NEXT: movdqa c+176(%rip), %xmm2 ; SSE42-NEXT: movdqa d+160(%rip), %xmm4 ; SSE42-NEXT: movdqa d+176(%rip), %xmm5 ; SSE42-NEXT: movdqa d+128(%rip), %xmm6 -; SSE42-NEXT: pinsrd $0, %eax, %xmm1 -; SSE42-NEXT: psubd %xmm1, %xmm6 +; SSE42-NEXT: pinsrd $0, %eax, %xmm0 +; SSE42-NEXT: psubd %xmm0, %xmm6 ; SSE42-NEXT: psubd %xmm2, %xmm5 -; SSE42-NEXT: psubd %xmm0, %xmm4 +; SSE42-NEXT: psubd %xmm1, %xmm4 ; SSE42-NEXT: movdqa %xmm4, d+160(%rip) ; SSE42-NEXT: movdqa %xmm5, d+176(%rip) ; SSE42-NEXT: movdqa %xmm3, d+144(%rip) ; SSE42-NEXT: movdqa %xmm6, d+128(%rip) ; SSE42-NEXT: paddd %xmm2, %xmm2 -; SSE42-NEXT: paddd %xmm0, %xmm0 -; SSE42-NEXT: movdqa %xmm0, c+160(%rip) +; SSE42-NEXT: paddd %xmm1, %xmm1 +; SSE42-NEXT: movdqa %xmm1, c+160(%rip) ; SSE42-NEXT: movdqa %xmm2, c+176(%rip) ; SSE42-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -111,21 +111,18 @@ ; XOPAVX1-LABEL: rot_v4i32_mask_ashr0: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: rot_v4i32_mask_ashr0: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_mask_ashr0: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = ashr <4 x i32> %a0, @@ -139,7 +136,6 @@ ; XOPAVX1-LABEL: rot_v4i32_mask_ashr1: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: retq @@ -147,7 +143,6 @@ ; XOPAVX2-LABEL: rot_v4i32_mask_ashr1: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsrad $25, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -155,7 +150,6 @@ ; AVX512-LABEL: rot_v4i32_mask_ashr1: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $25, %xmm0, %xmm0 -; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -581,28 +581,33 @@ ; X64-NEXT: subq $104, %rsp ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: pcmpgtd %xmm0, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; X64-NEXT: psllq $32, %xmm3 +; X64-NEXT: movdqa %xmm3, %xmm2 +; X64-NEXT: psrad $31, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; X64-NEXT: psrlq $31, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-NEXT: paddq %xmm0, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %rbp +; X64-NEXT: movq %xmm0, %r15 +; X64-NEXT: movq %r15, %rbp ; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shldq $31, %rbx, %rbp +; X64-NEXT: shldq $31, %r15, %rbp +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %r15, %r12 ; X64-NEXT: shlq $31, %r12 ; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %rbp, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -610,16 +615,16 @@ ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r13 ; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: xorl %ebx, %r15d ; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %rbp, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rbx, %rcx ; X64-NEXT: callq __modti3@PLT ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al -; X64-NEXT: testb %bl, %al +; X64-NEXT: testb %r15b, %al ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF @@ -699,43 +704,45 @@ ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-NEXT: psrlq $1, %xmm1 ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; X64-NEXT: # xmm1 = mem[2,3,2,3] -; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: paddq %xmm1, %xmm1 -; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm1, %rbx -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: sarq $63, %r12 -; X64-NEXT: shldq $31, %rbx, %r12 -; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; X64-NEXT: # xmm1 = mem[2,3,2,3] -; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-NEXT: punpckldq 
{{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm1, %rdx +; X64-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = mem[0,1,1,3] +; X64-NEXT: psllq $32, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrad $31, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; X64-NEXT: psrlq $31, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm0, %rbx +; X64-NEXT: movq %rbx, %r13 +; X64-NEXT: sarq $63, %r13 +; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpgtd %xmm0, %xmm1 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: sarq $63, %rbp ; X64-NEXT: movq %rbx, %r15 ; X64-NEXT: shlq $31, %r15 ; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r12, %rsi +; X64-NEXT: movq %r13, %rsi ; X64-NEXT: movq %rbp, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %r13 +; X64-NEXT: subq $1, %r12 ; X64-NEXT: sbbq $0, %r14 ; X64-NEXT: shrq $63, %rbx ; X64-NEXT: xorl %ebp, %ebx ; X64-NEXT: movq %r15, %rdi -; X64-NEXT: movq %r12, %rsi +; X64-NEXT: movq %r13, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; X64-NEXT: movq %rbp, %rcx ; X64-NEXT: callq __modti3@PLT @@ -743,25 +750,25 @@ ; X64-NEXT: setne %al ; X64-NEXT: testb %bl, %al ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF -; X64-NEXT: cmovbq %r13, %rax +; X64-NEXT: cmovbq %r12, %rax ; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovnsq %rcx, %r13 -; X64-NEXT: cmoveq %rax, %r13 +; X64-NEXT: cmovnsq %rcx, %r12 +; X64-NEXT: cmoveq %rax, %r12 ; X64-NEXT: movl $0, %eax ; X64-NEXT: cmovnsq %rax, %r14 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: cmpq %rcx, %r12 ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: cmovaq %r13, %rax +; X64-NEXT: cmovaq %r12, %rax ; X64-NEXT: testq %r14, %r14 -; X64-NEXT: cmovsq %rcx, %r13 +; X64-NEXT: cmovsq %rcx, %r12 ; X64-NEXT: cmpq $-1, %r14 -; X64-NEXT: cmoveq %rax, %r13 -; X64-NEXT: movq %r13, %xmm0 +; X64-NEXT: cmoveq %rax, %r12 +; X64-NEXT: movq %r12, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] @@ -816,12 +823,12 @@ ; X64-NEXT: cmovsq %rcx, %r12 ; X64-NEXT: cmpq $-1, %r14 ; X64-NEXT: cmoveq %rax, %r12 -; X64-NEXT: movq %r12, %xmm0 -; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0] -; X64-NEXT: psrlq $1, %xmm1 -; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-NEXT: movq %r12, %xmm1 +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: psrlq $1, %xmm0 +; X64-NEXT: shufps $136, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; X64-NEXT: # xmm0 = xmm0[0,2],mem[0,2] ; X64-NEXT: addq $104, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 @@ -840,116 +847,108 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $256, %esp # imm = 0x100 -; X86-NEXT: movl 24(%ebp), %edx -; X86-NEXT: movl 40(%ebp), %edi -; X86-NEXT: leal {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: addl %edx, %edx -; X86-NEXT: adcl %eax, %eax +; X86-NEXT: movl 16(%ebp), %edi +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shldl $31, %edx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $1, %eax -; X86-NEXT: negl %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: leal (%edi,%edi), %eax +; X86-NEXT: shrl $31, %edi +; X86-NEXT: shldl $31, %eax, %edi +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %edx +; X86-NEXT: calll __divti3 +; X86-NEXT: addl $32, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl 32(%ebp) +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl $0 +; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 36(%ebp), %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl 20(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: adcl %edx, %edx +; X86-NEXT: movl 36(%ebp), %edx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: shldl $31, %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %ecx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: leal (%ecx,%ecx), %eax +; X86-NEXT: shrl $31, %ecx +; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $1, %edx -; X86-NEXT: negl %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx 
+; X86-NEXT: pushl %edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl 28(%ebp), %ebx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: addl %eax, %eax -; X86-NEXT: adcl %esi, %esi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: leal (%ecx,%ecx), %eax +; X86-NEXT: shrl $31, %ecx ; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $1, %esi -; X86-NEXT: negl %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ebx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edi ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 40(%ebp), %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl 16(%ebp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: adcl %ebx, %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shldl $31, %ecx, %edi -; X86-NEXT: shll $31, %ecx +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: leal (%ecx,%ecx), %eax +; X86-NEXT: shrl $31, %ecx +; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $1, %ebx -; X86-NEXT: negl %ebx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp @@ -958,39 +957,25 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl 32(%ebp) -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl %eax -; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx ; X86-NEXT: pushl 40(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %edi +; X86-NEXT: 
pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl 36(%ebp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp @@ -1005,22 +990,22 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: sets %bl ; X86-NEXT: testl %edi, %edi -; X86-NEXT: sets %al -; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %ah -; X86-NEXT: xorb %al, %ah +; X86-NEXT: sets %bh +; X86-NEXT: xorb %bl, %bh ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: orl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %edi, %eax ; X86-NEXT: setne %al -; X86-NEXT: testb %ah, %al +; X86-NEXT: testb %bh, %al ; X86-NEXT: cmovel %esi, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1030,7 +1015,7 @@ ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: cmovel %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1047,7 +1032,7 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bl ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %bh @@ -1085,11 +1070,11 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: sets %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: testl %edx, %edx +; X86-NEXT: sets %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %bl ; X86-NEXT: xorb %al, %bl ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -1100,7 +1085,7 @@ ; X86-NEXT: pushl %ecx ; 
X86-NEXT: pushl %ecx ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -382,93 +382,85 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: addl %eax, %eax -; X86-NEXT: setb %cl -; X86-NEXT: shldl $31, %eax, %ecx -; X86-NEXT: shll $31, %eax +; X86-NEXT: leal (%eax,%eax), %ecx +; X86-NEXT: shrl $31, %eax +; X86-NEXT: shldl $31, %ecx, %eax ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %eax +; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: addl %ebp, %ebp -; X86-NEXT: setb %al -; X86-NEXT: shldl $31, %ebp, %eax -; X86-NEXT: shll $31, %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: leal (%ebx,%ebx), %eax +; X86-NEXT: shrl $31, %ebx +; X86-NEXT: shldl $31, %eax, %ebx ; X86-NEXT: pushl $0 -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %eax ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: addl %edi, %edi -; X86-NEXT: setb %al -; X86-NEXT: shldl $31, %edi, %eax -; X86-NEXT: shll $31, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: leal (%esi,%esi), %eax +; X86-NEXT: shrl $31, %esi +; X86-NEXT: shldl $31, %eax, %esi ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: addl %esi, %esi -; X86-NEXT: setb %al -; X86-NEXT: shldl $31, %esi, %eax -; X86-NEXT: shll $31, %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: leal (%edx,%edx), %ecx +; X86-NEXT: shrl $31, %edx +; X86-NEXT: shldl $31, %ecx, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: cmpl $2, %esi +; X86-NEXT: movl $-1, %edx +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: cmpl $1, %esi +; X86-NEXT: movl $1, %ebp +; X86-NEXT: cmovael %ebp, %esi +; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: cmpl $2, %ebx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: cmpl $1, %ebx +; X86-NEXT: cmovael %ebp, %ebx +; X86-NEXT: shldl $31, %eax, %ebx +; X86-NEXT: cmpl $2, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: cmpl $1, %edi +; X86-NEXT: cmovael %ebp, %edi +; X86-NEXT: shldl $31, %eax, %edi ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ecx +; X86-NEXT: 
pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: cmpl $2, %edx -; X86-NEXT: movl $-1, %esi -; X86-NEXT: cmovael %esi, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovael %ecx, %eax ; X86-NEXT: cmpl $1, %edx -; X86-NEXT: movl $1, %ecx -; X86-NEXT: cmovael %ecx, %edx -; X86-NEXT: shldl $31, %eax, %edx -; X86-NEXT: cmpl $2, %edi -; X86-NEXT: cmovael %esi, %ebx -; X86-NEXT: cmpl $1, %edi -; X86-NEXT: cmovael %ecx, %edi -; X86-NEXT: shldl $31, %ebx, %edi -; X86-NEXT: cmpl $2, %ebp -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: cmovael %esi, %eax -; X86-NEXT: cmpl $1, %ebp -; X86-NEXT: cmovael %ecx, %ebp +; X86-NEXT: cmovbl %edx, %ebp ; X86-NEXT: shldl $31, %eax, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: cmpl $2, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovael %esi, %eax -; X86-NEXT: cmpl $1, %ebx -; X86-NEXT: cmovbl %ebx, %ecx -; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $12, %esp +; X86-NEXT: movl %ebp, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: addl $8, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -36,7 +36,7 @@ ; SSE2-NEXT: psrlq %xmm4, %xmm1 ; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1] ; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: psllq $1, %xmm0 +; SSE2-NEXT: paddq %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psllq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] @@ -56,12 +56,12 @@ ; SSE41-NEXT: psrlq %xmm4, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pandn %xmm3, %xmm2 -; SSE41-NEXT: psllq $1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllq %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSE41-NEXT: paddq %xmm0, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psllq %xmm1, %xmm3 ; SSE41-NEXT: psllq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] ; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: retq ; @@ -74,11 +74,11 @@ ; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -88,7 +88,7 @@ ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; 
AVX2-NEXT: retq @@ -99,7 +99,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -110,7 +110,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -121,7 +121,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -142,7 +142,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -156,13 +156,13 @@ ; XOPAVX1-LABEL: var_funnnel_v2i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 -; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -172,7 +172,7 @@ ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsllq $1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -188,7 +188,7 @@ ; X86-SSE2-NEXT: psrlq %xmm5, %xmm1 ; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1] ; X86-SSE2-NEXT: pandn %xmm4, %xmm2 -; X86-SSE2-NEXT: psllq $1, %xmm0 +; X86-SSE2-NEXT: paddq %xmm0, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psllq %xmm2, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] @@ -225,7 +225,7 @@ ; SSE2-NEXT: pslld $23, %xmm2 ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm1 -; SSE2-NEXT: pslld $1, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -260,7 +260,7 @@ ; SSE41-NEXT: pslld $23, %xmm2 ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm1 -; SSE41-NEXT: pslld $1, %xmm0 +; SSE41-NEXT: paddd %xmm0, %xmm0 ; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: por %xmm6, %xmm0 ; SSE41-NEXT: retq @@ -285,7 +285,7 @@ 
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -296,7 +296,7 @@ ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -307,7 +307,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -318,7 +318,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -329,7 +329,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -350,7 +350,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -364,13 +364,13 @@ ; XOPAVX1-LABEL: var_funnnel_v4i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] -; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 -; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -380,7 +380,7 @@ ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpslld $1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0 ; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq @@ -409,7 +409,7 @@ ; X86-SSE2-NEXT: pslld $23, %xmm2 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1 -; X86-SSE2-NEXT: pslld $1, %xmm0 +; X86-SSE2-NEXT: paddd %xmm0, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -473,7 
+473,7 @@ ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm4, %xmm2 -; SSE2-NEXT: psllw $1, %xmm0 +; SSE2-NEXT: paddw %xmm0, %xmm0 ; SSE2-NEXT: pmullw %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 @@ -519,7 +519,7 @@ ; SSE41-NEXT: paddd %xmm4, %xmm0 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0 ; SSE41-NEXT: packusdw %xmm2, %xmm0 -; SSE41-NEXT: psllw $1, %xmm3 +; SSE41-NEXT: paddw %xmm3, %xmm3 ; SSE41-NEXT: pmullw %xmm0, %xmm3 ; SSE41-NEXT: por %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 @@ -554,7 +554,7 @@ ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -608,7 +608,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -630,7 +630,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -644,13 +644,13 @@ ; XOP-LABEL: var_funnnel_v8i16: ; XOP: # %bb.0: ; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpsllw $1, %xmm0, %xmm0 -; XOP-NEXT: vpshlw %xmm4, %xmm0, %xmm0 -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vpshlw %xmm2, %xmm1, %xmm1 +; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOP-NEXT: vpsubw %xmm4, %xmm5, %xmm4 +; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; @@ -703,7 +703,7 @@ ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 ; X86-SSE2-NEXT: packssdw %xmm4, %xmm2 -; X86-SSE2-NEXT: psllw $1, %xmm0 +; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: pmullw %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm3, %xmm0 @@ -1036,7 +1036,7 @@ ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: psrlq %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: psllq $1, %xmm0 +; SSE-NEXT: paddq %xmm0, %xmm0 ; SSE-NEXT: psllq %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1047,7 +1047,7 @@ ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -1058,7 +1058,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1069,7 +1069,7 @@ ; 
AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1080,7 +1080,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -1101,7 +1101,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1119,7 +1119,7 @@ ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpsllq $1, %xmm0, %xmm0 +; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -1131,7 +1131,7 @@ ; X86-SSE2-NEXT: pand %xmm3, %xmm4 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: psllq $1, %xmm0 +; X86-SSE2-NEXT: paddq %xmm0, %xmm0 ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -1256,7 +1256,7 @@ ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: psrlw %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: psllw $1, %xmm0 +; SSE-NEXT: paddw %xmm0, %xmm0 ; SSE-NEXT: psllw %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq @@ -1267,7 +1267,7 @@ ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -1278,7 +1278,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1289,7 +1289,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1300,7 +1300,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -1321,7 +1321,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor 
%xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1339,7 +1339,7 @@ ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -1351,7 +1351,7 @@ ; X86-SSE2-NEXT: pand %xmm3, %xmm4 ; X86-SSE2-NEXT: psrlw %xmm4, %xmm1 ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: psllw $1, %xmm0 +; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: psllw %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -1761,7 +1761,7 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: psllw $1, %xmm0 +; SSE2-NEXT: paddw %xmm0, %xmm0 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 @@ -1772,7 +1772,7 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = ; SSE41-NEXT: pmulhuw %xmm1, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE41-NEXT: psllw $1, %xmm0 +; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -1781,7 +1781,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -1790,7 +1790,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -1799,7 +1799,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1810,7 +1810,7 @@ ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8] -; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1829,7 +1829,7 @@ ; AVX512VLBW-LABEL: constant_funnnel_v8i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1843,7 +1843,7 @@ ; XOP-LABEL: constant_funnnel_v8i16: ; XOP: # %bb.0: ; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOP-NEXT: vpshlw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq @@ -1853,7 +1853,7 @@ ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: psllw $1, %xmm0 +; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -37,17 +37,17 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpsllq $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddq %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsllq %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -58,7 +58,7 @@ ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -69,7 +69,7 @@ ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -80,7 +80,7 @@ ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -91,7 +91,7 @@ ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -111,7 +111,7 @@ ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; 
AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -125,23 +125,23 @@ ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] -; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 +; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 -; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; XOPAVX1-NEXT: vpsllq $1, %xmm6, %xmm6 -; XOPAVX1-NEXT: vpshlq %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 +; XOPAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; XOPAVX1-NEXT: vpsubq %xmm5, %xmm6, %xmm5 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; XOPAVX1-NEXT: vpshlq %xmm5, %xmm7, %xmm5 +; XOPAVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3 -; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; XOPAVX1-NEXT: vpshlq %xmm3, %xmm5, %xmm3 -; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 -; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; XOPAVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 +; XOPAVX1-NEXT: vpshlq %xmm3, %xmm4, %xmm3 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -151,7 +151,7 @@ ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; XOPAVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -184,7 +184,7 @@ ; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpslld $1, %xmm7, %xmm7 +; AVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7 ; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4 ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -202,7 +202,7 @@ ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -214,7 +214,7 @@ ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -225,7 +225,7 @@ ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -236,7 +236,7 @@ ; AVX512VL-NEXT: 
vpand %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -247,7 +247,7 @@ ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -267,7 +267,7 @@ ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vpslld $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -289,13 +289,13 @@ ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [31,31,31,31] ; XOPAVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; XOPAVX1-NEXT: vpslld $1, %xmm7, %xmm7 +; XOPAVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7 ; XOPAVX1-NEXT: vpshld %xmm3, %xmm7, %xmm3 ; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -307,7 +307,7 @@ ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; XOPAVX2-NEXT: vpslld $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -348,7 +348,7 @@ ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpsllw $1, %xmm7, %xmm7 +; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 ; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm6 @@ -375,7 +375,7 @@ ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 @@ -427,7 +427,7 @@ ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -447,7 +447,7 @@ ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -469,13 +469,13 @@ ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15] ; XOPAVX1-NEXT: vpxor 
%xmm6, %xmm3, %xmm3 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; XOPAVX1-NEXT: vpsllw $1, %xmm7, %xmm7 +; XOPAVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7 ; XOPAVX1-NEXT: vpshlw %xmm3, %xmm7, %xmm3 ; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -484,22 +484,22 @@ ; XOPAVX2-LABEL: var_funnnel_v16i16: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4 +; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0 -; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6 -; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5 -; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0 -; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; XOPAVX2-NEXT: vpsubw %xmm5, %xmm6, %xmm5 +; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 +; XOPAVX2-NEXT: vpshlw %xmm5, %xmm7, %xmm5 +; XOPAVX2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3 -; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3 -; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2 -; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1 -; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3 +; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) @@ -782,9 +782,9 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsllq $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -796,7 +796,7 @@ ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -807,7 +807,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -818,7 +818,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; 
AVX512VL-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -829,7 +829,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -849,7 +849,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -871,9 +871,9 @@ ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpsllq $1, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpaddq %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -885,7 +885,7 @@ ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsllq $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -1020,11 +1020,11 @@ ; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsllw $1, %xmm3, %xmm3 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -1036,7 +1036,7 @@ ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1047,7 +1047,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1058,7 +1058,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -1069,7 +1069,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; 
AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1089,7 +1089,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1109,11 +1109,11 @@ ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; XOPAVX1-NEXT: vpsllw $1, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 @@ -1125,7 +1125,7 @@ ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -1494,10 +1494,10 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm2 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm2 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1508,7 +1508,7 @@ ; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1518,7 +1518,7 @@ ; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -1528,7 +1528,7 @@ ; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -1539,7 +1539,7 @@ ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 
-; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1556,7 +1556,7 @@ ; AVX512VLBW-LABEL: constant_funnnel_v16i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1573,10 +1573,10 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm2 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm2 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1587,7 +1587,7 @@ ; XOPAVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -22,7 +22,7 @@ ; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -33,7 +33,7 @@ ; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512VL-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -44,7 +44,7 @@ ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -61,7 +61,7 @@ ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -82,7 +82,7 @@ ; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; 
AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -93,7 +93,7 @@ ; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512VL-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -104,7 +104,7 @@ ; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -121,7 +121,7 @@ ; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnd %zmm3, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpslld $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -188,7 +188,7 @@ ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -205,7 +205,7 @@ ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -428,7 +428,7 @@ ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq @@ -439,7 +439,7 @@ ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -450,7 +450,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -468,7 +468,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllq $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -554,9 +554,9 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpsllw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX512F-NEXT: 
vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -572,9 +572,9 @@ ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpsllw $1, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddw %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -586,7 +586,7 @@ ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -604,7 +604,7 @@ ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq @@ -853,7 +853,7 @@ ; AVX512BW-LABEL: constant_funnnel_v32i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -867,7 +867,7 @@ ; AVX512VLBW-LABEL: constant_funnnel_v32i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -963,7 +963,7 @@ ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: psrlw %xmm3, %xmm4 ; SSE41-NEXT: pandn %xmm2, %xmm1 -; SSE41-NEXT: psllw $1, %xmm0 +; SSE41-NEXT: paddw %xmm0, %xmm0 ; SSE41-NEXT: psllw %xmm1, %xmm0 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: retq @@ -974,7 +974,7 @@ ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -985,7 +985,7 @@ ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -996,7 +996,7 @@ ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddw %xmm0, 
%xmm0, %xmm0 ; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: retq @@ -1007,7 +1007,7 @@ ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: retq @@ -1018,7 +1018,7 @@ ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -789,11 +789,11 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm5 ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $1, %xmm4, %xmm2 +; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm2 ; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -805,7 +805,7 @@ ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -816,7 +816,7 @@ ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -827,7 +827,7 @@ ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -838,7 +838,7 @@ ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: retq @@ -849,7 +849,7 @@ ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -301,9 +301,9 @@ ; AVX512F-NEXT: vpsrlw %xmm3, 
%ymm0, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpsllw $1, %ymm4, %ymm2 +; AVX512F-NEXT: vpaddw %ymm4, %ymm4, %ymm2 ; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 @@ -318,9 +318,9 @@ ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpsllw $1, %ymm4, %ymm2 +; AVX512VL-NEXT: vpaddw %ymm4, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 @@ -332,7 +332,7 @@ ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: retq @@ -343,7 +343,7 @@ ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -18,7 +18,7 @@ ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 ; CHECK-NEXT: pmulhw %xmm1, %xmm0 -; CHECK-NEXT: psllw $1, %xmm0 +; CHECK-NEXT: paddw %xmm0, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: retq %t = call <4 x i16> @llvm.smul.fix.v4i16(<4 x i16> , <4 x i16> %a, i32 15) @@ -33,7 +33,7 @@ ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 ; CHECK-NEXT: pmulhuw %xmm1, %xmm0 -; CHECK-NEXT: psllw $1, %xmm0 +; CHECK-NEXT: paddw %xmm0, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: retq %t = call <4 x i16> @llvm.umul.fix.v4i16(<4 x i16> , <4 x i16> %a, i32 15) diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -927,23 +927,23 @@ ; SSE2-LABEL: constant_shift_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psllq $1, %xmm1 -; SSE2-NEXT: psllq $7, %xmm0 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: psllq $7, %xmm1 +; SSE2-NEXT: paddq %xmm0, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: psllq $7, %xmm1 -; SSE41-NEXT: psllq $1, %xmm0 +; SSE41-NEXT: paddq %xmm0, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; @@ -975,9 +975,9 @@ ; X86-SSE-LABEL: constant_shift_v2i64: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: psllq $1, %xmm1 -; X86-SSE-NEXT: psllq $7, %xmm0 -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X86-SSE-NEXT: psllq $7, %xmm1 +; X86-SSE-NEXT: paddq %xmm0, %xmm0 +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X86-SSE-NEXT: retl %shift = shl <2 x i64> %a, <i64 1, i64 7> ret <2 x i64> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -1061,7 +1061,7 @@ ; AVX1-NEXT: vpsllq $31, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpsllq $7, %xmm0, %xmm2 -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -1101,7 +1101,7 @@ ; X86-AVX1-NEXT: vpsllq $31, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; X86-AVX1-NEXT: vpsllq $7, %xmm0, %xmm2 -; X86-AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: retl diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_minimal.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_minimal.s new file mode 100644 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_minimal.s @@ -0,0 +1,18 @@ +# RUN: llvm-mc -triple=i386-unknown-linux-gnu -position-independent -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec %t.o + + .text + .globl main + .p2align 4 + .type main,@function +main: + pushl %ebp + movl %esp, %ebp + pushl %eax + movl $0, -4(%ebp) + movl $42, %eax + addl $4, %esp + popl %ebp + retl + + .size main, .-main \ No newline at end of file diff --git a/llvm/test/ExecutionEngine/JITLink/i386/lit.local.cfg b/llvm/test/ExecutionEngine/JITLink/i386/lit.local.cfg new file mode 100644 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/i386/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'i386' in config.root.targets: + config.unsupported = True \ No newline at end of file diff --git a/llvm/test/MC/ELF/debug-hash-file.s b/llvm/test/MC/ELF/debug-hash-file.s --- a/llvm/test/MC/ELF/debug-hash-file.s +++ b/llvm/test/MC/ELF/debug-hash-file.s @@ -23,6 +23,26 @@ // DWARF5-NEXT: dir_index: 0 // DWARF5-NOT: file_names[ 1]: +// RUN: llvm-mc -triple=x86_64 -filetype=obj -g -dwarf-version=4 -fdebug-prefix-map=/MyTest=/src_root %s -o %t.4.o +// RUN: llvm-dwarfdump -debug-info -debug-line %t.4.o | FileCheck %s --check-prefixes=MAP,MAP_V4 +// RUN: llvm-mc -triple=x86_64 -filetype=obj -g -dwarf-version=5 -fdebug-prefix-map=/MyTest=/src_root %s -o %t.5.o +// RUN: llvm-dwarfdump -debug-info -debug-line %t.5.o | FileCheck %s --check-prefixes=MAP,MAP_V5 + +// MAP-LABEL: DW_TAG_compile_unit +// MAP: DW_AT_name ("/src_root/Inputs{{(/|\\)+}}other.S") +// MAP-LABEL: DW_TAG_label +// MAP: DW_AT_decl_file ("/src_root/Inputs{{(/|\\)+}}other.S") + +// MAP_V4: include_directories[ 1] = "/src_root/Inputs" +// MAP_V4-NEXT: file_names[ 1]: +// MAP_V4-NEXT: name: "other.S" +// MAP_V4-NEXT: dir_index: 1 + +// MAP_V5: include_directories[ 0] = "{{.*}}" +// MAP_V5-NEXT: file_names[ 0]: +// MAP_V5-NEXT:
name: "/src_root/Inputs/other.S" +// MAP_V5-NEXT: dir_index: 0 + # 1 "/MyTest/Inputs/other.S" foo: diff --git a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll --- a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll +++ b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll @@ -275,7 +275,7 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP1]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[FOR_COND]] ; CHECK: for.end: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[SUM_0]], [[FOR_COND]] ] @@ -410,7 +410,7 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP2]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i64 0, [[INDVARS_IV_NEXT]] ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[FOR_END]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll b/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll --- a/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll @@ -75,7 +75,7 @@ ; CHECK-NEXT: br label [[B18:%.*]] ; CHECK: B18: ; CHECK-NEXT: [[DOT02:%.*]] = phi i32 [ [[TMP33:%.*]], [[B24:%.*]] ], [ 0, [[B18_PREHEADER]] ] -; CHECK-NEXT: [[TMP33]] = add nuw nsw i32 [[DOT02]], 1 +; CHECK-NEXT: [[TMP33]] = add nuw i32 [[DOT02]], 1 ; CHECK-NEXT: [[O:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 [[DOT02]] ; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[O]], align 4 ; CHECK-NEXT: [[T:%.*]] = icmp eq i32 [[V]], 0 @@ -167,11 +167,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[SIZE]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[HSIZE:%.*]] to i64 ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[NSTEPS:%.*]], i32 1) -; CHECK-NEXT: [[WIDE_TRIP_COUNT14:%.*]] = zext i32 [[SMAX]] to i64 +; CHECK-NEXT: [[WIDE_TRIP_COUNT11:%.*]] = zext i32 [[SMAX]] to i64 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV9:%.*]] = phi i64 [ [[INDVARS_IV_NEXT10:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 [[INDVARS_IV9]], [[TMP0]] +; CHECK-NEXT: [[INDVARS_IV7:%.*]] = phi i64 [ [[INDVARS_IV_NEXT8:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw i64 [[INDVARS_IV7]], [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[CMP215]], label [[FOR_BODY2_PREHEADER:%.*]], label [[FOR_INC]] ; CHECK: for.body2.preheader: @@ -188,22 +188,22 @@ ; CHECK: for.body3.preheader: ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 -; CHECK-NEXT: [[WIDE_TRIP_COUNT7:%.*]] = zext i32 [[SIZE]] to i64 +; CHECK-NEXT: [[WIDE_TRIP_COUNT5:%.*]] = zext i32 [[SIZE]] to i64 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ 1, [[FOR_BODY3_PREHEADER]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_BODY3]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw 
i64 [[TMP6]], [[INDVARS_IV3]] +; CHECK-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ 1, [[FOR_BODY3_PREHEADER]] ], [ [[INDVARS_IV_NEXT3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], [[INDVARS_IV2]] ; CHECK-NEXT: [[ADD_PTR2:%.*]] = getelementptr inbounds i8, i8* [[BC0]], i64 [[TMP7]] ; CHECK-NEXT: store i8 [[TMP1]], i8* [[ADD_PTR2]], align 1 -; CHECK-NEXT: [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1 -; CHECK-NEXT: [[EXITCOND8:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT4]], [[WIDE_TRIP_COUNT7]] -; CHECK-NEXT: br i1 [[EXITCOND8]], label [[FOR_BODY3]], label [[FOR_INC_LOOPEXIT:%.*]] +; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1 +; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], [[WIDE_TRIP_COUNT5]] +; CHECK-NEXT: br i1 [[EXITCOND6]], label [[FOR_BODY3]], label [[FOR_INC_LOOPEXIT:%.*]] ; CHECK: for.inc.loopexit: ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: -; CHECK-NEXT: [[INDVARS_IV_NEXT10]] = add nuw nsw i64 [[INDVARS_IV9]], 1 -; CHECK-NEXT: [[EXITCOND15:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT10]], [[WIDE_TRIP_COUNT14]] -; CHECK-NEXT: br i1 [[EXITCOND15]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK-NEXT: [[INDVARS_IV_NEXT8]] = add nuw nsw i64 [[INDVARS_IV7]], 1 +; CHECK-NEXT: [[EXITCOND12:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT8]], [[WIDE_TRIP_COUNT11]] +; CHECK-NEXT: br i1 [[EXITCOND12]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll --- a/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll @@ -77,10 +77,17 @@ ; CHECK: general_case24: ; CHECK-NEXT: br i1 false, label [[LOOP2_PREHEADER:%.*]], label [[LOOP2_EXIT]] ; CHECK: loop2.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = udiv i32 14, [[LOCAL_0_]] +; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 60392, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], 60392 ; CHECK-NEXT: br label [[LOOP2:%.*]] ; CHECK: loop2: +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[TMP5]], -1 ; CHECK-NEXT: [[I4:%.*]] = load atomic i64, i64* [[P1:%.*]] unordered, align 8 -; CHECK-NEXT: [[I6:%.*]] = sub i64 [[I4]], -1 +; CHECK-NEXT: [[I6:%.*]] = sub i64 [[I4]], [[INDVARS_IV_NEXT]] ; CHECK-NEXT: store atomic i64 [[I6]], i64* [[P1]] unordered, align 8 ; CHECK-NEXT: br i1 true, label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[LOOP2]] ; CHECK: loop2.exit.loopexit: diff --git a/llvm/test/Transforms/IndVarSimplify/bbi-63564.ll b/llvm/test/Transforms/IndVarSimplify/bbi-63564.ll --- a/llvm/test/Transforms/IndVarSimplify/bbi-63564.ll +++ b/llvm/test/Transforms/IndVarSimplify/bbi-63564.ll @@ -19,7 +19,7 @@ ; CHECK-NEXT: br label [[FOR_BODY2:%.*]] ; CHECK: for.body2: ; CHECK-NEXT: [[INC2:%.*]] = phi i16 [ undef, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY2]] ] -; CHECK-NEXT: [[INC]] = add nuw nsw i16 [[INC2]], 1 +; CHECK-NEXT: [[INC]] = add nsw i16 [[INC2]], 1 ; CHECK-NEXT: store i16 [[INC]], i16* undef, align 1 ; CHECK-NEXT: br i1 true, label [[FOR_BODY2]], label [[CRIT_EDGE:%.*]] ; CHECK: crit_edge: diff --git a/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll b/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll --- a/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll +++ 
b/llvm/test/Transforms/IndVarSimplify/cycled_phis.ll @@ -85,7 +85,7 @@ ; CHECK-NEXT: [[UNSIGNED_CMP:%.*]] = icmp ult i32 [[IV]], [[LEN]] ; CHECK-NEXT: br i1 [[UNSIGNED_CMP]], label [[BACKEDGE]], label [[FAILED_UNSIGNED:%.*]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[DONE:%.*]] ; CHECK: failed.signed: @@ -161,7 +161,7 @@ ; CHECK-NEXT: [[UNSIGNED_CMP:%.*]] = icmp ult i32 [[IV]], [[LEN]] ; CHECK-NEXT: br i1 [[UNSIGNED_CMP]], label [[BACKEDGE]], label [[FAILED_UNSIGNED:%.*]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[DONE:%.*]] ; CHECK: failed.signed: @@ -252,7 +252,7 @@ ; CHECK: signed.passed: ; CHECK-NEXT: br i1 true, label [[BACKEDGE]], label [[FAILED_UNSIGNED:%.*]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[DONE:%.*]] ; CHECK: failed.signed: @@ -354,7 +354,7 @@ ; CHECK-NEXT: [[UNSIGNED_CMP:%.*]] = icmp ult i32 [[IV_START]], [[LEN]] ; CHECK-NEXT: br i1 [[UNSIGNED_CMP]], label [[BACKEDGE]], label [[FAILED_UNSIGNED:%.*]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[OUTER_LOOP_BACKEDGE]] ; CHECK: outer.loop.backedge: @@ -472,7 +472,7 @@ ; CHECK-NEXT: [[UNSIGNED_CMP:%.*]] = icmp ult i32 [[IV_START]], [[LEN]] ; CHECK-NEXT: br i1 [[UNSIGNED_CMP]], label [[BACKEDGE]], label [[FAILED_UNSIGNED:%.*]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[OUTER_LOOP_SELECTION:%.*]] ; CHECK: outer.loop.selection: diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll --- a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll +++ b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll @@ -638,7 +638,7 @@ ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP26]], 0 ; CHECK-NEXT: br i1 [[TMP29]], label [[BB1]], label [[BB2_LOOPEXIT]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP30]] = add nuw nsw i32 [[VAR_1]], 1 +; CHECK-NEXT: [[TMP30]] = add nuw i32 [[VAR_1]], 1 ; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[VAR_0]], 0 ; CHECK-NEXT: br i1 [[TMP31]], label [[BB3:%.*]], label [[BB0]] ; CHECK: bb2.loopexit: @@ -1003,7 +1003,7 @@ ; CHECK: checked.2: ; CHECK-NEXT: br i1 true, label [[BACKEDGE]], label [[FAIL]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 758394 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 758394 ; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @cond_func() ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: fail: @@ -1055,7 +1055,7 @@ ; CHECK: checked.2: ; CHECK-NEXT: br i1 [[C3]], label [[BACKEDGE]], label [[FAIL]] ; CHECK: backedge: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 758394 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 758394 ; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @cond_func() ; CHECK-NEXT: br i1 
[[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: fail: diff --git a/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll b/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll --- a/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll +++ b/llvm/test/Transforms/IndVarSimplify/finite-exit-comparisons.ll @@ -1029,7 +1029,7 @@ ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nuw i8 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[IV_NEXT]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]] ; CHECK: for.end.loopexit: diff --git a/llvm/test/Transforms/IndVarSimplify/loop-predication.ll b/llvm/test/Transforms/IndVarSimplify/loop-predication.ll --- a/llvm/test/Transforms/IndVarSimplify/loop-predication.ll +++ b/llvm/test/Transforms/IndVarSimplify/loop-predication.ll @@ -611,7 +611,7 @@ ; CHECK-NEXT: ret i32 -1 ; CHECK: guarded: ; CHECK-NEXT: store volatile i32 0, i32* [[A:%.*]], align 4 -; CHECK-NEXT: [[I_NEXT]] = add nuw i32 [[I]], 1 +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 ; CHECK-NEXT: br label [[LOOP]] ; loop.preheader: diff --git a/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll b/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll --- a/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll +++ b/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll @@ -17,7 +17,7 @@ ; CHECK-NEXT: [[CHECK_1:%.*]] = icmp slt i32 [[IV_1]], [[X:%.*]] ; CHECK-NEXT: br i1 [[CHECK_1]], label [[GUARDED_1]], label [[FAIL_LOOPEXIT:%.*]] ; CHECK: guarded.1: -; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i32 [[IV_1]], 1 +; CHECK-NEXT: [[IV_NEXT_1]] = add nuw i32 [[IV_1]], 1 ; CHECK-NEXT: [[LOOP_COND_1:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[LOOP_COND_1]], label [[LOOP_1]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: loop.2: @@ -86,7 +86,7 @@ ; CHECK-NEXT: [[CHECK_2:%.*]] = icmp slt i32 [[IV_2]], [[X:%.*]] ; CHECK-NEXT: br i1 [[CHECK_2]], label [[GUARDED_2]], label [[FAIL_LOOPEXIT1:%.*]] ; CHECK: guarded.2: -; CHECK-NEXT: [[IV_NEXT_2]] = add nuw nsw i32 [[IV_2]], 1 +; CHECK-NEXT: [[IV_NEXT_2]] = add nuw i32 [[IV_2]], 1 ; CHECK-NEXT: [[LOOP_COND_2:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[LOOP_COND_2]], label [[LOOP_2]], label [[EXIT_LOOPEXIT2:%.*]] ; CHECK: exit.loopexit: diff --git a/llvm/test/Transforms/InstCombine/known-phi-br.ll b/llvm/test/Transforms/InstCombine/known-phi-br.ll --- a/llvm/test/Transforms/InstCombine/known-phi-br.ll +++ b/llvm/test/Transforms/InstCombine/known-phi-br.ll @@ -6,11 +6,14 @@ ; the known bits of a phi edge based off a conditional branch feeding the phi. 
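For orientation, a minimal sketch of the simplification these tests are after (the function @known_bits_sketch is hypothetical and not part of the patch; the tests below mark the actual fold as a TODO): the branch condition pins down the known bits of the phi operand flowing in from the guarded edge, so a mask applied after the phi is redundant.

    define i64 @known_bits_sketch(i64 %x) {
    entry:
      %cmp = icmp ult i64 %x, 8          ; on the true edge, %x fits in 3 bits
      br i1 %cmp, label %end, label %body
    body:
      %mask = and i64 %x, 7              ; on the false edge, %x is masked explicitly
      br label %end
    end:
      %phi = phi i64 [ %x, %entry ], [ %mask, %body ]
      %res = and i64 %phi, 7             ; both incoming values already satisfy the
      ret i64 %res                       ; mask, so %res could fold to %phi itself
    }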
; +declare void @use(i1) + ; TODO: %x either eq 7 or is set to 7 define i64 @limit_i64_eq_7(i64 %x) { ; CHECK-LABEL: @limit_i64_eq_7( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[X:%.*]], 7 +; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY:%.*]] ; CHECK: body: ; CHECK-NEXT: br label [[END]] @@ -20,6 +23,7 @@ ; entry: %cmp = icmp eq i64 %x, 7 + call void @use(i1 %cmp) br i1 %cmp, label %end, label %body body: br label %end @@ -32,8 +36,9 @@ define i64 @limit_i64_ne_255(i64 %x) { ; CHECK-LABEL: @limit_i64_ne_255( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[X:%.*]], 255 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[END:%.*]], label [[BODY:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[X:%.*]], 255 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: br i1 [[CMP]], label [[BODY:%.*]], label [[END:%.*]] ; CHECK: body: ; CHECK-NEXT: br label [[END]] ; CHECK: end: @@ -42,6 +47,7 @@ ; entry: %cmp = icmp ne i64 %x, 255 + call void @use(i1 %cmp) br i1 %cmp, label %body, label %end body: br label %end @@ -55,6 +61,7 @@ ; CHECK-LABEL: @limit_i64_ule_15( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X:%.*]], 16 +; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY:%.*]] ; CHECK: body: ; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 15 @@ -66,6 +73,7 @@ ; entry: %cmp = icmp ule i64 %x, 15 + call void @use(i1 %cmp) br i1 %cmp, label %end, label %body body: %mask = and i64 %x, 15 @@ -81,6 +89,7 @@ ; CHECK-LABEL: @limit_i64_uge_8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[X:%.*]], 7 +; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: br i1 [[CMP]], label [[BODY:%.*]], label [[END:%.*]] ; CHECK: body: ; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 7 @@ -92,6 +101,7 @@ ; entry: %cmp = icmp uge i64 %x, 8 + call void @use(i1 %cmp) br i1 %cmp, label %body, label %end body: %mask = and i64 %x, 7 @@ -107,6 +117,7 @@ ; CHECK-LABEL: @limit_i64_ult_8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X:%.*]], 8 +; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY:%.*]] ; CHECK: body: ; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 7 @@ -118,6 +129,7 @@ ; entry: %cmp = icmp ult i64 %x, 8 + call void @use(i1 %cmp) br i1 %cmp, label %end, label %body body: %mask = and i64 %x, 7 @@ -133,6 +145,7 @@ ; CHECK-LABEL: @limit_i64_ugt_7( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[X:%.*]], 7 +; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: br i1 [[CMP]], label [[BODY:%.*]], label [[END:%.*]] ; CHECK: body: ; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 7 @@ -144,6 +157,7 @@ ; entry: %cmp = icmp ugt i64 %x, 7 + call void @use(i1 %cmp) br i1 %cmp, label %body, label %end body: %mask = and i64 %x, 7 @@ -154,4 +168,62 @@ ret i64 %res } +; +; negative tests +; +; %x either ule 15 or is masked with 15 +define i64 @limit_i64_ule_15_mask3(i64 %x) { +; CHECK-LABEL: @limit_i64_ule_15_mask3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X:%.*]], 16 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY:%.*]] +; CHECK: body: +; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 15 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[X_MASK:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ [[MASK]], [[BODY]] ] +; CHECK-NEXT: [[RES:%.*]] = and i64 [[X_MASK]], 3 +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %cmp = 
icmp ule i64 %x, 15 + call void @use(i1 %cmp) + br i1 %cmp, label %end, label %body +body: + %mask = and i64 %x, 15 + br label %end +end: + %x.mask = phi i64 [ %x, %entry ], [ %mask, %body ] + %res = and i64 %x.mask, 3 + ret i64 %res +} + +; %x either ult 8 or is masked with 7 +define i64 @limit_i64_ult_8_mask1(i64 %x) { +; CHECK-LABEL: @limit_i64_ult_8_mask1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[X:%.*]], 8 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY:%.*]] +; CHECK: body: +; CHECK-NEXT: [[MASK:%.*]] = and i64 [[X]], 7 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[X_MASK:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ [[MASK]], [[BODY]] ] +; CHECK-NEXT: [[RES:%.*]] = and i64 [[X_MASK]], 1 +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %cmp = icmp ult i64 %x, 8 + call void @use(i1 %cmp) + br i1 %cmp, label %end, label %body +body: + %mask = and i64 %x, 7 + br label %end +end: + %x.mask = phi i64 [ %x, %entry ], [ %mask, %body ] + %res = and i64 %x.mask, 1 + ret i64 %res +} diff --git a/llvm/test/Transforms/InstCombine/snprintf-2.ll b/llvm/test/Transforms/InstCombine/snprintf-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/snprintf-2.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; +; Verify that snprintf calls with a constant size not exceeding INT_MAX +; and constant format string with no formatting directives are transformed +; into memcpy. Also verify that a size in excess of INT_MAX prevents +; the transformation. +; +; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s -check-prefixes=ANY,BE +; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s -check-prefixes=ANY,LE + +@s = constant [4 x i8] c"123\00" + +@adst = external global [0 x i8*] +@asiz = external global [0 x i32] + +declare i32 @snprintf(i8*, i64, i8*, ...) + + +; Verify that all snprintf calls with a bound between INT_MAX and down +; to 0 are transformed to memcpy. 
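To make the CHECK lines below easier to read, here is a minimal sketch of one representative fold, reusing the @s and @snprintf definitions above; the function @fold_sketch and its argument are hypothetical, and the store constants come from the BE/LE CHECK lines themselves:

    define i32 @fold_sketch(i8* %dst) {
      ; a bound of 5 exceeds strlen("123") + 1, so nothing is truncated
      %fmt = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0
      %n = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %dst, i64 5, i8* %fmt)
      ; instcombine rewrites the call to the equivalent of
      ;   %p32 = bitcast i8* %dst to i32*
      ;   store i32 825373440, i32* %p32, align 1   ; "123\00" big-endian
      ;                                             ; (3355185 little-endian)
      ; and %n folds to the constant 3; smaller bounds store bound-1
      ; characters plus the terminating nul, and the result is still 3
      ret i32 %n
    }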
+ +define void @fold_snprintf_fmt() { +; BE-LABEL: @fold_snprintf_fmt( +; BE-NEXT: [[PDIMAX1:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2147483647) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PDIMAX1]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; BE-NEXT: [[PD52:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 5) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PD52]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 5), align 4 +; BE-NEXT: [[PD43:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 4) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PD43]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; BE-NEXT: [[PD3:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 3), align 8 +; BE-NEXT: [[TMP1:%.*]] = bitcast i8* [[PD3]] to i16* +; BE-NEXT: store i16 12594, i16* [[TMP1]], align 1 +; BE-NEXT: [[ENDPTR:%.*]] = getelementptr inbounds i8, i8* [[PD3]], i64 2 +; BE-NEXT: store i8 0, i8* [[ENDPTR]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 3), align 4 +; BE-NEXT: [[PD2:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; BE-NEXT: store i8 49, i8* [[PD2]], align 1 +; BE-NEXT: [[ENDPTR4:%.*]] = getelementptr inbounds i8, i8* [[PD2]], i64 1 +; BE-NEXT: store i8 0, i8* [[ENDPTR4]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; BE-NEXT: [[PD1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; BE-NEXT: store i8 0, i8* [[PD1]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; BE-NEXT: ret void +; +; LE-LABEL: @fold_snprintf_fmt( +; LE-NEXT: [[PDIMAX1:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2147483647) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PDIMAX1]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; LE-NEXT: [[PD52:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 5) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PD52]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 5), align 4 +; LE-NEXT: [[PD43:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 4) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PD43]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; LE-NEXT: [[PD3:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 3), align 8 +; LE-NEXT: [[TMP1:%.*]] = bitcast i8* [[PD3]] to i16* +; LE-NEXT: store i16 12849, i16* [[TMP1]], align 1 +; LE-NEXT: [[ENDPTR:%.*]] = getelementptr inbounds i8, i8* [[PD3]], i64 2 +; LE-NEXT: store i8 0, i8* [[ENDPTR]], align 
1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 3), align 4 +; LE-NEXT: [[PD2:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; LE-NEXT: store i8 49, i8* [[PD2]], align 1 +; LE-NEXT: [[ENDPTR4:%.*]] = getelementptr inbounds i8, i8* [[PD2]], i64 1 +; LE-NEXT: store i8 0, i8* [[ENDPTR4]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; LE-NEXT: [[PD1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; LE-NEXT: store i8 0, i8* [[PD1]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; LE-NEXT: ret void +; + %fmt = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0 + + %pdimax = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2147483647) + %nimax = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimax, i64 2147483647, i8* %fmt) + store i32 %nimax, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + %pd5 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 5) + %n5 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd5, i64 5, i8* %fmt) + store i32 %n5, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 5) + + %pd4 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 4) + %n4 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd4, i64 4, i8* %fmt) + store i32 %n4, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 4) + + %pd3 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 3) + %n3 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd3, i64 3, i8* %fmt) + store i32 %n3, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 3) + + %pd2 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2) + %n2 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd2, i64 2, i8* %fmt) + store i32 %n2, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 2) + + %pd1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %n1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd1, i64 1, i8* %fmt) + store i32 %n1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + %pd0 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %n0 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd0, i64 0, i8* %fmt) + store i32 %n0, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + ret void +} + + +; Verify that snprintf calls with a bound greater than INT_MAX are not +; transformed. POSIX requires implementations to set errno to EOVERFLOW +; so such calls could be folded to just that followed by returning -1. + +define void @call_snprintf_fmt_ximax() { +; ANY-LABEL: @call_snprintf_fmt_ximax( +; ANY-NEXT: [[PDM1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; ANY-NEXT: [[NM1:%.*]] = call i32 (i8*, i64, i8*, ...) 
@snprintf(i8* noundef nonnull dereferenceable(1) [[PDM1]], i64 -1, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @s, i64 0, i64 0)) +; ANY-NEXT: store i32 [[NM1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; ANY-NEXT: [[PDIMAXP1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 0), align 8 +; ANY-NEXT: [[NIMAXP1:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[PDIMAXP1]], i64 2147483648, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @s, i64 0, i64 0)) +; ANY-NEXT: store i32 [[NIMAXP1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; ANY-NEXT: ret void +; + %fmt = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0 + + %pdm1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %nm1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdm1, i64 -1, i8* %fmt) + store i32 %nm1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + %pdimaxp1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %nimaxp1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimaxp1, i64 2147483648, i8* %fmt) + store i32 %nimaxp1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + ret void +} diff --git a/llvm/test/Transforms/InstCombine/snprintf-3.ll b/llvm/test/Transforms/InstCombine/snprintf-3.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/snprintf-3.ll @@ -0,0 +1,139 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; +; Verify that snprintf calls with a constant size not exceeding INT_MAX +; and a "%s" format string and a const string argument are transformed +; into memcpy. Also verify that a size in excess of INT_MAX prevents +; the transformation. +; +; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s -check-prefixes=ANY,BE +; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s -check-prefixes=ANY,LE + +@pcnt_s = constant [3 x i8] c"%s\00" +@s = constant [4 x i8] c"123\00" + +@adst = external global [0 x i8*] +@asiz = external global [0 x i32] + +declare i32 @snprintf(i8*, i64, i8*, ...) + + +; Verify that all snprintf calls with a bound between INT_MAX and down +; to 0 are transformed to memcpy. 
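The point worth keeping in mind for the truncating cases below is that the result stays the full length. A minimal sketch of the bound-2 fold, reusing @pcnt_s, @s, and @snprintf from this file (the function @pcnt_s_sketch is hypothetical):

    define i32 @pcnt_s_sketch(i8* %dst) {
      ; "%s" with the constant argument "123", but room for just one character
      %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_s, i32 0, i32 0
      %ps = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0
      %n = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %dst, i64 2, i8* %fmt, i8* %ps)
      ; instcombine rewrites the call to the equivalent of
      ;   store i8 49, i8* %dst, align 1            ; '1'
      ;   %endptr = getelementptr inbounds i8, i8* %dst, i64 1
      ;   store i8 0, i8* %endptr, align 1
      ; and %n folds to 3: snprintf returns the length the untruncated
      ; output would have had, not the number of characters written
      ret i32 %n
    }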
+ +define void @fold_snprintf_pcnt_s() { +; BE-LABEL: @fold_snprintf_pcnt_s( +; BE-NEXT: [[PDIMAX1:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2147483647) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PDIMAX1]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; BE-NEXT: [[PD52:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 5) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PD52]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 5), align 4 +; BE-NEXT: [[PD43:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 4) to i32**), align 8 +; BE-NEXT: store i32 825373440, i32* [[PD43]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; BE-NEXT: [[PD3:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 3), align 8 +; BE-NEXT: [[TMP1:%.*]] = bitcast i8* [[PD3]] to i16* +; BE-NEXT: store i16 12594, i16* [[TMP1]], align 1 +; BE-NEXT: [[ENDPTR:%.*]] = getelementptr inbounds i8, i8* [[PD3]], i64 2 +; BE-NEXT: store i8 0, i8* [[ENDPTR]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 3), align 4 +; BE-NEXT: [[PD2:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; BE-NEXT: store i8 49, i8* [[PD2]], align 1 +; BE-NEXT: [[ENDPTR4:%.*]] = getelementptr inbounds i8, i8* [[PD2]], i64 1 +; BE-NEXT: store i8 0, i8* [[ENDPTR4]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; BE-NEXT: [[PD1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; BE-NEXT: store i8 0, i8* [[PD1]], align 1 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; BE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; BE-NEXT: ret void +; +; LE-LABEL: @fold_snprintf_pcnt_s( +; LE-NEXT: [[PDIMAX1:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2147483647) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PDIMAX1]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; LE-NEXT: [[PD52:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 5) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PD52]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 5), align 4 +; LE-NEXT: [[PD43:%.*]] = load i32*, i32** bitcast (i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 4) to i32**), align 8 +; LE-NEXT: store i32 3355185, i32* [[PD43]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; LE-NEXT: [[PD3:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 3), align 8 +; LE-NEXT: [[TMP1:%.*]] = bitcast i8* [[PD3]] to i16* +; LE-NEXT: store i16 12849, i16* [[TMP1]], align 1 +; LE-NEXT: [[ENDPTR:%.*]] = getelementptr inbounds i8, i8* [[PD3]], i64 2 +; LE-NEXT: store i8 0, i8* 
[[ENDPTR]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 3), align 4 +; LE-NEXT: [[PD2:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; LE-NEXT: store i8 49, i8* [[PD2]], align 1 +; LE-NEXT: [[ENDPTR4:%.*]] = getelementptr inbounds i8, i8* [[PD2]], i64 1 +; LE-NEXT: store i8 0, i8* [[ENDPTR4]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; LE-NEXT: [[PD1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; LE-NEXT: store i8 0, i8* [[PD1]], align 1 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; LE-NEXT: store i32 3, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; LE-NEXT: ret void +; + %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_s, i32 0, i32 0 + %ps = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0 + + %pdimax = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2147483647) + %nimax = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimax, i64 2147483647, i8* %fmt, i8* %ps) + store i32 %nimax, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + %pd5 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 5) + %n5 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd5, i64 5, i8* %fmt, i8* %ps) + store i32 %n5, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 5) + + %pd4 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 4) + %n4 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd4, i64 4, i8* %fmt, i8* %ps) + store i32 %n4, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 4) + + %pd3 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 3) + %n3 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd3, i64 3, i8* %fmt, i8* %ps) + store i32 %n3, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 3) + + %pd2 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2) + %n2 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd2, i64 2, i8* %fmt, i8* %ps) + store i32 %n2, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 2) + + %pd1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %n1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd1, i64 1, i8* %fmt, i8* %ps) + store i32 %n1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + %pd0 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %n0 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd0, i64 0, i8* %fmt, i8* %ps) + store i32 %n0, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + ret void +} + + +; Verify that snprintf calls with a bound greater than INT_MAX are not +; transformed. POSIX requires implementations to set errno to EOVERFLOW +; so such calls could be folded to just that followed by returning -1. + +define void @call_snprintf_pcnt_s_ximax() { +; ANY-LABEL: @call_snprintf_pcnt_s_ximax( +; ANY-NEXT: [[PDM1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; ANY-NEXT: [[NM1:%.*]] = call i32 (i8*, i64, i8*, ...) 
@snprintf(i8* noundef nonnull dereferenceable(1) [[PDM1]], i64 -1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @pcnt_s, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @s, i64 0, i64 0)) +; ANY-NEXT: store i32 [[NM1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; ANY-NEXT: [[PDIMAXP1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 0), align 8 +; ANY-NEXT: [[NIMAXP1:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[PDIMAXP1]], i64 2147483648, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @pcnt_s, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @s, i64 0, i64 0)) +; ANY-NEXT: store i32 [[NIMAXP1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; ANY-NEXT: ret void +; + %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_s, i32 0, i32 0 + %ps = getelementptr [4 x i8], [4 x i8]* @s, i32 0, i32 0 + + %pdm1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %nm1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdm1, i64 -1, i8* %fmt, i8* %ps) + store i32 %nm1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + %pdimaxp1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %nimaxp1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimaxp1, i64 2147483648, i8* %fmt, i8* %ps) + store i32 %nimaxp1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + ret void +} diff --git a/llvm/test/Transforms/InstCombine/snprintf-4.ll b/llvm/test/Transforms/InstCombine/snprintf-4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/snprintf-4.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; +; Verify that snprintf calls with a constant size not exceeding INT_MAX +; and a "%c" format string are transformed into a store of the character. +; Also verify that a size in excess of INT_MAX prevents the transformation. +; +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +@pcnt_c = constant [3 x i8] c"%c\00" + +@adst = external global [0 x i8*] +@asiz = external global [0 x i32] + +declare i32 @snprintf(i8*, i64, i8*, ...) + + +; Verify that all snprintf calls with a bound between INT_MAX and down +; to 0 are transformed to stores of the character.
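Unlike the two files above, the interesting folds here involve a nonconstant argument. A minimal sketch of the bound-2 case, reusing @pcnt_c and @snprintf from this file (the function @pcnt_c_sketch is hypothetical):

    define i32 @pcnt_c_sketch(i8* %dst, i32 %c) {
      %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_c, i32 0, i32 0
      %n = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %dst, i64 2, i8* %fmt, i32 %c)
      ; instcombine rewrites the call to the equivalent of
      ;   %char = trunc i32 %c to i8
      ;   store i8 %char, i8* %dst, align 1
      ;   %nul = getelementptr inbounds i8, i8* %dst, i64 1
      ;   store i8 0, i8* %nul, align 1
      ; and %n folds to the constant 1; a bound of 1 stores only the nul
      ; and a bound of 0 stores nothing, yet the result is 1 either way
      ret i32 %n
    }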
+ +define void @fold_snprintf_pcnt_c(i32 %c) { +; CHECK-LABEL: @fold_snprintf_pcnt_c( +; CHECK-NEXT: [[PDIMAX:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 0), align 8 +; CHECK-NEXT: store i8 1, i8* [[PDIMAX]], align 1 +; CHECK-NEXT: [[NUL:%.*]] = getelementptr inbounds i8, i8* [[PDIMAX]], i64 1 +; CHECK-NEXT: store i8 0, i8* [[NUL]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; CHECK-NEXT: [[PD2:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; CHECK-NEXT: store i8 2, i8* [[PD2]], align 1 +; CHECK-NEXT: [[NUL1:%.*]] = getelementptr inbounds i8, i8* [[PD2]], i64 1 +; CHECK-NEXT: store i8 0, i8* [[NUL1]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; CHECK-NEXT: [[PD2_0:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; CHECK-NEXT: store i8 0, i8* [[PD2_0]], align 1 +; CHECK-NEXT: [[NUL2:%.*]] = getelementptr inbounds i8, i8* [[PD2_0]], i64 1 +; CHECK-NEXT: store i8 0, i8* [[NUL2]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; CHECK-NEXT: [[PD1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 3), align 8 +; CHECK-NEXT: store i8 0, i8* [[PD1]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 3), align 4 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; CHECK-NEXT: [[PD2_C:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 4), align 8 +; CHECK-NEXT: [[CHAR:%.*]] = trunc i32 [[C:%.*]] to i8 +; CHECK-NEXT: store i8 [[CHAR]], i8* [[PD2_C]], align 1 +; CHECK-NEXT: [[NUL3:%.*]] = getelementptr inbounds i8, i8* [[PD2_C]], i64 1 +; CHECK-NEXT: store i8 0, i8* [[NUL3]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 4), align 4 +; CHECK-NEXT: [[PD1_C:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 5), align 8 +; CHECK-NEXT: store i8 0, i8* [[PD1_C]], align 1 +; CHECK-NEXT: store i32 1, i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 5), align 4 +; CHECK-NEXT: ret void +; + %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_c, i32 0, i32 0 + + ; Transform snprintf(dst, INT_MAX, "%c", 1) to memcpy(dst, "1", 2), 1. + %pdimax = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %nimax = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimax, i64 2147483647, i8* %fmt, i32 1) + store i32 %nimax, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + ; Transform snprintf(dst, 2, "%c", '\2') to memcpy(dst, "2", 2), 1. + %pd2 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %n2 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd2, i64 2, i8* %fmt, i8 2) + store i32 %n2, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + ; Transform snprintf(dst, 2, "%c", '\0') to memcpy(dst, "\0", 2), 1. + %pd2_0 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2) + %n2_0 = call i32 (i8*, i64, i8*, ...) 
@snprintf(i8* %pd2_0, i64 2, i8* %fmt, i8 0) + store i32 %n2_0, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 2) + + ; Transform snprintf(dst, 1, "%c", (short)3) to memcpy(dst, "\3", 2), 1. + %pd1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 3) + %n1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd1, i64 1, i8* %fmt, i16 3) + store i32 %n1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 3) + + ; Fold snprintf(dst, 0, "%c", 4) to 1. + %pd0 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 4) + %n0 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd0, i64 0, i8* %fmt, i32 4) + store i32 %n0, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 4) + + + ; Transform snprintf(dst, 2, "%c", c) with a nonconstant c to + ; dst[0] = c, dst[1] = '\0', 1. + %pd2_c = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 4) + %n2_c = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd2_c, i64 2, i8* %fmt, i32 %c) + store i32 %n2_c, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 4) + + ; Transform snprintf(dst, 1, "%c", c) with a nonconstant c to *dst = '\0', 0. + %pd1_c = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 5) + %n1_c = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pd1_c, i64 1, i8* %fmt, i32 %c) + store i32 %n1_c, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 5) + + ret void +} + + +; Verify that snprintf calls with a bound greater than INT_MAX are not +; transformed. POSIX requires implementations to set errno to EOVERFLOW +; so such calls could be folded to just that followed by returning -1. + +define void @call_snprintf_pcnt_c_ximax(i32 %c) { +; CHECK-LABEL: @call_snprintf_pcnt_c_ximax( +; CHECK-NEXT: [[PDM1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 0), align 8 +; CHECK-NEXT: [[NM1:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[PDM1]], i64 -1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @pcnt_c, i64 0, i64 0), i8 0) +; CHECK-NEXT: store i32 [[NM1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 0), align 4 +; CHECK-NEXT: [[PDIMAXP1:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 1), align 8 +; CHECK-NEXT: [[NIMAXP1:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[PDIMAXP1]], i64 2147483648, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @pcnt_c, i64 0, i64 0), i8 1) +; CHECK-NEXT: store i32 [[NIMAXP1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 1), align 4 +; CHECK-NEXT: [[PDM1SL32:%.*]] = load i8*, i8** getelementptr inbounds ([0 x i8*], [0 x i8*]* @adst, i64 0, i64 2), align 8 +; CHECK-NEXT: [[NM1SL32:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[PDM1SL32]], i64 -4294967296, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @pcnt_c, i64 0, i64 0), i8 1) +; CHECK-NEXT: store i32 [[NM1SL32]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @asiz, i64 0, i64 2), align 4 +; CHECK-NEXT: ret void +; + %fmt = getelementptr [3 x i8], [3 x i8]* @pcnt_c, i32 0, i32 0 + + %pdm1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 0) + %nm1 = call i32 (i8*, i64, i8*, ...) 
@snprintf(i8* %pdm1, i64 -1, i8* %fmt, i8 0) + store i32 %nm1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 0) + + + %pdimaxp1 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 1) + %nimaxp1 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdimaxp1, i64 2147483648, i8* %fmt, i8 1) + store i32 %nimaxp1, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 1) + + ; Exercise snprintf(dst, -1LU << 32, "%c", c). + %pdm1sl32 = load i8*, i8** getelementptr ([0 x i8*], [0 x i8*]* @adst, i32 0, i32 2) + %nm1sl32 = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %pdm1sl32, i64 18446744069414584320, i8* %fmt, i8 1) + store i32 %nm1sl32, i32* getelementptr ([0 x i32], [0 x i32]* @asiz, i32 0, i32 2) + + ret void +} diff --git a/llvm/test/Transforms/InstCombine/snprintf.ll b/llvm/test/Transforms/InstCombine/snprintf.ll --- a/llvm/test/Transforms/InstCombine/snprintf.ll +++ b/llvm/test/Transforms/InstCombine/snprintf.ll @@ -92,10 +92,10 @@ ret i32 %call } -define i32 @test_char_wrong_size(i8* %buf) #0 { -; CHECK-LABEL: @test_char_wrong_size( -; CHECK-NEXT: [[CALL:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[BUF:%.*]], i64 1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.2, i64 0, i64 0), i32 65) -; CHECK-NEXT: ret i32 [[CALL]] +define i32 @test_char_small_size(i8* %buf) #0 { +; CHECK-LABEL: @test_char_small_size( +; CHECK-NEXT: store i8 0, i8* [[BUF:%.*]], align 1 +; CHECK-NEXT: ret i32 1 ; %call = call i32 (i8*, i64, i8*, ...) @snprintf(i8* %buf, i64 1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.2, i64 0, i64 0), i32 65) #2 ret i32 %call @@ -120,10 +120,10 @@ ret i32 %call } -define i32 @test_str_wrong_size(i8* %buf) #0 { -; CHECK-LABEL: @test_str_wrong_size( -; CHECK-NEXT: [[CALL:%.*]] = call i32 (i8*, i64, i8*, ...) @snprintf(i8* noundef nonnull dereferenceable(1) [[BUF:%.*]], i64 1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.3, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0)) -; CHECK-NEXT: ret i32 [[CALL]] +define i32 @test_str_small_size(i8* %buf) #0 { +; CHECK-LABEL: @test_str_small_size( +; CHECK-NEXT: store i8 0, i8* [[BUF:%.*]], align 1 +; CHECK-NEXT: ret i32 3 ; %call = call i32 (i8*, i64, i8*, ...) 
@snprintf(i8* %buf, i64 1, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.3, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0)) #2 ret i32 %call diff --git a/llvm/test/Transforms/InstCombine/ssubo.ll b/llvm/test/Transforms/InstCombine/ssubo.ll --- a/llvm/test/Transforms/InstCombine/ssubo.ll +++ b/llvm/test/Transforms/InstCombine/ssubo.ll @@ -4,6 +4,8 @@ declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) declare { i8, i1 } @llvm.ssub.with.overflow.i8(i8, i8) +declare void @use(i1) + define i1 @test_generic(i64 %a, i64 %b) { ; CHECK-LABEL: @test_generic( ; CHECK-NEXT: [[RES:%.*]] = tail call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A:%.*]], i64 [[B:%.*]]) @@ -95,3 +97,72 @@ ret i1 %overflow } +define i1 @sub_eq0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_eq0( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i8 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[EQ0]] +; + %ss = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %eq0 = icmp eq i8 %sub, 0 + ret i1 %eq0 +} + +define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_ne0( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[NE0:%.*]] = icmp ne i8 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[NE0]] +; + %ss = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %ne0 = icmp ne i8 %sub, 0 + ret i1 %ne0 +} + +; negative test - need zero + +define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_eq1( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0 +; CHECK-NEXT: [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1 +; CHECK-NEXT: ret i1 [[EQ1]] +; + %ss = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %eq1 = icmp eq i8 %sub, 1 + ret i1 %eq1 +} + +; negative test - need equality pred + +define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_sgt0( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0 +; CHECK-NEXT: [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0 +; CHECK-NEXT: ret i1 [[SGT0]] +; + %ss = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %sgt0 = icmp sgt i8 %sub, 0 + ret i1 %sgt0 +} diff --git a/llvm/test/Transforms/InstCombine/usubo.ll b/llvm/test/Transforms/InstCombine/usubo.ll --- a/llvm/test/Transforms/InstCombine/usubo.ll +++ b/llvm/test/Transforms/InstCombine/usubo.ll @@ -4,6 +4,8 @@ declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) declare { i8, i1 } 
@llvm.usub.with.overflow.i8(i8, i8) +declare void @use(i1) + define i1 @test_generic(i64 %a, i64 %b) { ; CHECK-LABEL: @test_generic( ; CHECK-NEXT: [[OVERFLOW:%.*]] = icmp ult i64 [[A:%.*]], [[B:%.*]] @@ -94,3 +96,70 @@ ret i1 %overflow } +define i1 @sub_eq0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_eq0( +; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i8 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[EQ0]] +; + %us = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %us, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %us, 0 + %eq0 = icmp eq i8 %sub, 0 + ret i1 %eq0 +} + +define i1 @sub_ne0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_ne0( +; CHECK-NEXT: [[OV:%.*]] = icmp ult i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[NE0:%.*]] = icmp ne i8 [[X]], [[Y]] +; CHECK-NEXT: ret i1 [[NE0]] +; + %us = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %us, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %us, 0 + %ne0 = icmp ne i8 %sub, 0 + ret i1 %ne0 +} + +; negative test - need zero + +define i1 @sub_eq1(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_eq1( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0 +; CHECK-NEXT: [[EQ1:%.*]] = icmp eq i8 [[SUB]], 1 +; CHECK-NEXT: ret i1 [[EQ1]] +; + %ss = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %eq1 = icmp eq i8 %sub, 1 + ret i1 %eq1 +} + +; negative test - need equality pred + +define i1 @sub_sgt0(i8 %x, i8 %y, i1 %b) { +; CHECK-LABEL: @sub_sgt0( +; CHECK-NEXT: [[SS:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[SS]], 1 +; CHECK-NEXT: call void @use(i1 [[OV]]) +; CHECK-NEXT: [[SUB:%.*]] = extractvalue { i8, i1 } [[SS]], 0 +; CHECK-NEXT: [[SGT0:%.*]] = icmp sgt i8 [[SUB]], 0 +; CHECK-NEXT: ret i1 [[SGT0]] +; + %ss = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %x, i8 %y) + %ov = extractvalue { i8, i1 } %ss, 1 + call void @use(i1 %ov) + %sub = extractvalue { i8, i1 } %ss, 0 + %sgt0 = icmp sgt i8 %sub, 0 + ret i1 %sgt0 +} diff --git a/llvm/test/Transforms/InstSimplify/implies.ll b/llvm/test/Transforms/InstSimplify/implies.ll --- a/llvm/test/Transforms/InstSimplify/implies.ll +++ b/llvm/test/Transforms/InstSimplify/implies.ll @@ -255,3 +255,15 @@ %res = icmp sge i1 %var30, %var29 ret i1 %res } + +; X <=(s) Y == Y ==> X (i1 1 becomes -1 for reasoning) +define i1 @test_sle(i32 %length.i, i32 %i) { +; CHECK-LABEL: @test_sle( +; CHECK-NEXT: ret i1 true +; + %iplus1 = add nsw nuw i32 %i, 1 + %var29 = icmp ult i32 %i, %length.i + %var30 = icmp ult i32 %iplus1, %length.i + %res = icmp sle i1 %var29, %var30 + ret i1 %res +} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll --- a/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll @@ -3147,7 +3147,7 @@ ; EPILOG-NEXT: %cmp1.7 = icmp ult i32 %inc.7, %N ; EPILOG-NEXT: br i1 %cmp1.7, label %latch.7, label 
%latchExit.epilog-lcssa.loopexit ; EPILOG: latch.7: -; EPILOG-NEXT: %niter.next.7 = add nuw i32 %niter.next.6, 1 +; EPILOG-NEXT: %niter.next.7 = add i32 %niter.next.6, 1 ; EPILOG-NEXT: %niter.ncmp.7 = icmp ne i32 %niter.next.7, %unroll_iter ; EPILOG-NEXT: br i1 %niter.ncmp.7, label %header, label %latchExit.unr-lcssa.loopexit ; EPILOG: latchExit.unr-lcssa.loopexit: @@ -3209,7 +3209,7 @@ ; EPILOG-BLOCK-NEXT: %cmp1.1 = icmp ult i32 %inc.1, %N ; EPILOG-BLOCK-NEXT: br i1 %cmp1.1, label %latch.1, label %latchExit.epilog-lcssa.loopexit ; EPILOG-BLOCK: latch.1: -; EPILOG-BLOCK-NEXT: %niter.next.1 = add nuw i32 %niter.next, 1 +; EPILOG-BLOCK-NEXT: %niter.next.1 = add i32 %niter.next, 1 ; EPILOG-BLOCK-NEXT: %niter.ncmp.1 = icmp ne i32 %niter.next.1, %unroll_iter ; EPILOG-BLOCK-NEXT: br i1 %niter.ncmp.1, label %header, label %latchExit.unr-lcssa.loopexit, !llvm.loop !8 ; EPILOG-BLOCK: latchExit.unr-lcssa.loopexit: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll @@ -0,0 +1,248 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=-1 -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" +target triple = "riscv64" + +; Dependence distance between read and write is greater than the trip +; count of the loop. Thus, values written are never read for any +; valid vectorization of the loop. +define void @test(ptr %p) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 200 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, 200 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 +; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 200 +; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] +; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; 
CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [0, %entry], [%iv.next, %loop] + %a1 = getelementptr i64, ptr %p, i64 %iv + %v = load i64, ptr %a1, align 32 + %offset = add i64 %iv, 200 + %a2 = getelementptr i64, ptr %p, i64 %offset + store i64 %v, ptr %a2, align 32 + %iv.next = add i64 %iv, 1 + %cmp = icmp ne i64 %iv, 199 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; Dependence distance is less than the trip count, thus we must prove that the +; chosen VF is guaranteed to be less than the dependence distance. +define void @test_may_clobber(ptr %p) { +; CHECK-LABEL: @test_may_clobber( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 100 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, 200 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 +; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 100 +; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] +; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [0, %entry], [%iv.next, %loop] + %a1 = getelementptr i64, ptr %p, i64 %iv + %v = load i64, ptr %a1, align 32 + %offset = add i64 %iv, 100 + %a2 = getelementptr i64, ptr %p, i64 %offset + store i64 %v, ptr %a2, align 32 + %iv.next = add i64 %iv, 1 + %cmp = icmp ne i64 %iv, 199 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; Trivially no overlap due to maximum possible value of VLEN and LMUL +define void @trivial_due_max_vscale(ptr %p) { +; CHECK-LABEL: @trivial_due_max_vscale( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
[[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 32 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 8192 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <vscale x 1 x i64> [[WIDE_LOAD]], ptr [[TMP7]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 +; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 8192 +; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] +; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [0, %entry], [%iv.next, %loop] + %a1 = getelementptr i64, ptr %p, i64 %iv + %v = load i64, ptr %a1, align 32 + %offset = add i64 %iv, 8192 + %a2 = getelementptr i64, ptr %p, i64 %offset + store i64 %v, ptr %a2, align 32 + %iv.next = add i64 %iv, 1 + %cmp = icmp ne i64 %iv, 199 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; Dependence distance could be violated via LMUL>=2 or interleaving +define void @no_high_lmul_or_interleave(ptr %p) { +; CHECK-LABEL: @no_high_lmul_or_interleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0 +; 
CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 32 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 1024 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <vscale x 1 x i64> [[WIDE_LOAD]], ptr [[TMP7]], align 32 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 +; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 1024 +; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] +; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [0, %entry], [%iv.next, %loop] + %a1 = getelementptr i64, ptr %p, i64 %iv + %v = load i64, ptr %a1, align 32 + %offset = add i64 %iv, 1024 + %a2 = getelementptr i64, ptr %p, i64 %offset + store i64 %v, ptr %a2, align 32 + %iv.next = add i64 %iv, 1 + %cmp = icmp ne i64 %iv, 199 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-postpone.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -slp-vectorizer -mtriple x86_64-unknown-linux-gnu < %s | FileCheck %s + +define <4 x double> @test(double* %p2, double %i1754, double %i1781, double %i1778) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I1771:%.*]] = getelementptr inbounds double, double* [[P2:%.*]], i64 54 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I1754:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1778:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1754]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[I1771]] to <2 x double>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1781]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = 
shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[I1792]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> , double [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x double> [[TMP12]], [[TMP13]] +; CHECK-NEXT: ret <4 x double> [[TMP14]] +; +entry: + %i1771 = getelementptr inbounds double, double* %p2, i64 54 + %i1772 = load double, double* %i1771, align 8 + %i1773 = fmul fast double %i1772, %i1754 + %i1782 = fmul fast double %i1754, %i1754 + %i1783 = fadd fast double %i1782, 1.000000e+00 + %i1787 = fmul fast double %i1778, %i1754 + %i1788 = fadd fast double %i1787, 1.000000e+00 + %i1792 = fmul fast double %i1754, %i1781 + %i1793 = fadd fast double %i1792, 1.000000e+00 + %i1795 = getelementptr inbounds double, double* %p2, i64 55 + %i1796 = load double, double* %i1795, align 8 + %i1797 = fmul fast double %i1796, %i1781 + %i1798 = fadd fast double %i1773, %i1797 + %i1976 = insertelement <4 x double> zeroinitializer, double %i1783, i64 0 + %i1982 = insertelement <4 x double> %i1976, double %i1788, i64 1 + %i1988 = insertelement <4 x double> %i1982, double %i1793, i64 2 + %i1994 = insertelement <4 x double> %i1988, double %i1798, i64 3 + ret <4 x double> %i1994 +} diff --git a/llvm/test/Transforms/SimplifyCFG/nonintegral.ll b/llvm/test/Transforms/SimplifyCFG/nonintegral.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/nonintegral.ll @@ -0,0 +1,28 @@ +; RUN: opt -passes=simplifycfg -S < %s | FileCheck %s + +target datalayout = "ni:1" + +define void @test_01(i64 addrspace(1)* align 8 %ptr) { +; CHECK-LABEL: @test_01( +; CHECK-NOT: ptrtoint +; CHECK-NEXT: icmp eq i64 addrspace(1)* %ptr, null +; CHECK-NOT: ptrtoint + %cond1 = icmp eq i64 addrspace(1)* %ptr, null + %cond2 = icmp eq i64 addrspace(1)* %ptr, null + br i1 %cond1, label %true1, label %false1 + +true1: + br i1 %cond2, label %true2, label %false2 + +false1: + store i64 1, i64 addrspace(1)* %ptr, align 8 + br label %true1 + +true2: + store i64 2, i64 addrspace(1)* %ptr, align 8 + ret void + +false2: + store i64 3, i64 addrspace(1)* %ptr, align 8 + ret void +} diff --git a/llvm/test/tools/llvm-objdump/MachO/chained-fixups.yaml b/llvm/test/tools/llvm-objdump/MachO/chained-fixups.yaml --- a/llvm/test/tools/llvm-objdump/MachO/chained-fixups.yaml +++ b/llvm/test/tools/llvm-objdump/MachO/chained-fixups.yaml @@ -1,102 +1,107 @@ # RUN: yaml2obj %s -o %t # RUN: llvm-objdump -p %t | FileCheck %s # RUN: llvm-otool -l %t | FileCheck %s -# + # CHECK: LC_DYLD_CHAINED_FIXUPS # CHECK: LC_DYLD_EXPORTS_TRIE +# RUN: llvm-objdump --macho --chained-fixups %t | \ +# RUN: FileCheck --check-prefix=DETAILS -DNAME=%t %s +# RUN: llvm-otool -chained_fixups %t | \ +# RUN: FileCheck --check-prefix=DETAILS -DNAME=%t %s + +# DETAILS: [[NAME]]: +# DETAILS-NEXT: chained fixups header (LC_DYLD_CHAINED_FIXUPS) +# DETAILS-NEXT: fixups_version = 0 +# DETAILS-NEXT: starts_offset = 32 +# DETAILS-NEXT: imports_offset = 44 +# DETAILS-NEXT: symbols_offset = 44 +# DETAILS-NEXT: imports_count = 0 +# DETAILS-NEXT: imports_format = 1 (DYLD_CHAINED_IMPORT) +# DETAILS-NEXT: symbols_format = 0 + +## This yaml is from a dylib produced by ld64 +## echo ".global _foo\n_foo" > dylib.s +## clang -target=x86_64-apple-macos12 -dynamiclib -isysroot Inputs/MacOSX.sdk dylib.s -o libdylib.dylib +## obj2yaml --raw-segment=data libdylib.dylib --- !mach-o 
+IsLittleEndian: true FileHeader: magic: 0xFEEDFACF - cputype: 0x100000C - cpusubtype: 0x0 - filetype: 0x2 - ncmds: 16 - sizeofcmds: 744 - flags: 0x200085 + cputype: 0x1000007 + cpusubtype: 0x3 + filetype: 0x6 + ncmds: 13 + sizeofcmds: 568 + flags: 0x100085 reserved: 0x0 LoadCommands: - cmd: LC_SEGMENT_64 - cmdsize: 72 - segname: __PAGEZERO - vmaddr: 0 - vmsize: 4294967296 - fileoff: 0 - filesize: 0 - maxprot: 0 - initprot: 0 - nsects: 0 - flags: 0 - - cmd: LC_SEGMENT_64 - cmdsize: 232 + cmdsize: 152 segname: __TEXT - vmaddr: 4294967296 + vmaddr: 0 vmsize: 16384 fileoff: 0 filesize: 16384 maxprot: 5 initprot: 5 - nsects: 2 + nsects: 1 flags: 0 Sections: - sectname: __text segname: __TEXT - addr: 0x100003F98 - size: 24 - offset: 0x3F98 - align: 2 + addr: 0x4000 + size: 0 + offset: 0x4000 + align: 0 reloff: 0x0 nreloc: 0 flags: 0x80000400 reserved1: 0x0 reserved2: 0x0 reserved3: 0x0 - content: C0035FD6FF4300D100008052FF0F00B9FF430091C0035FD6 - - sectname: __unwind_info - segname: __TEXT - addr: 0x100003FB0 - size: 80 - offset: 0x3FB0 - align: 2 - reloff: 0x0 - nreloc: 0 - flags: 0x0 - reserved1: 0x0 - reserved2: 0x0 - reserved3: 0x0 - content: 010000001C000000000000001C000000000000001C00000002000000983F00003400000034000000B13F00000000000034000000030000000C0002001400020000000001040000000010000200000002 + content: '' - cmd: LC_SEGMENT_64 cmdsize: 72 segname: __LINKEDIT - vmaddr: 4294983680 + vmaddr: 16384 vmsize: 16384 fileoff: 16384 - filesize: 753 + filesize: 96 maxprot: 1 initprot: 1 nsects: 0 flags: 0 + - cmd: LC_ID_DYLIB + cmdsize: 48 + dylib: + name: 24 + timestamp: 1 + current_version: 0 + compatibility_version: 0 + Content: libdylib.dylib + ZeroPadBytes: 3 - cmd: LC_DYLD_CHAINED_FIXUPS cmdsize: 16 dataoff: 16384 - datasize: 56 + datasize: 48 - cmd: LC_DYLD_EXPORTS_TRIE cmdsize: 16 - dataoff: 16440 - datasize: 56 + dataoff: 16432 + datasize: 16 - cmd: LC_SYMTAB cmdsize: 24 - symoff: 16504 - nsyms: 15 - stroff: 16744 - strsize: 120 + symoff: 16456 + nsyms: 1 + stroff: 16472 + strsize: 8 - cmd: LC_DYSYMTAB cmdsize: 80 ilocalsym: 0 - nlocalsym: 12 - iextdefsym: 12 - nextdefsym: 3 - iundefsym: 15 + nlocalsym: 0 + iextdefsym: 0 + nextdefsym: 1 + iundefsym: 1 nundefsym: 0 tocoff: 0 ntoc: 0 @@ -110,136 +115,37 @@ nextrel: 0 locreloff: 0 nlocrel: 0 - - cmd: LC_LOAD_DYLINKER - cmdsize: 32 - name: 12 - Content: '/usr/lib/dyld' - ZeroPadBytes: 7 - cmd: LC_UUID cmdsize: 24 - uuid: F445529E-643C-3A38-8F59-AB64566BCAFF + uuid: 52409B91-DF59-346A-A63F-D4E6FFDC3E04 - cmd: LC_BUILD_VERSION cmdsize: 32 platform: 1 minos: 786432 - sdk: 786432 + sdk: 851968 ntools: 1 Tools: - tool: 3 - version: 46596096 + version: 53674242 - cmd: LC_SOURCE_VERSION cmdsize: 16 version: 0 - - cmd: LC_MAIN - cmdsize: 24 - entryoff: 16284 - stacksize: 0 - cmd: LC_LOAD_DYLIB cmdsize: 56 dylib: name: 24 timestamp: 2 - current_version: 85917696 + current_version: 65793 compatibility_version: 65536 - Content: '/usr/lib/libSystem.B.dylib' - ZeroPadBytes: 6 + Content: '/usr/lib/libSystem.dylib' + ZeroPadBytes: 8 - cmd: LC_FUNCTION_STARTS cmdsize: 16 - dataoff: 16496 + dataoff: 16448 datasize: 8 - cmd: LC_DATA_IN_CODE cmdsize: 16 - dataoff: 16504 + dataoff: 16456 datasize: 0 - - cmd: LC_CODE_SIGNATURE - cmdsize: 16 - dataoff: 16864 - datasize: 273 -LinkEditData: - NameList: - - n_strx: 33 - n_type: 0x64 - n_sect: 0 - n_desc: 0 - n_value: 0 - - n_strx: 39 - n_type: 0x64 - n_sect: 0 - n_desc: 0 - n_value: 0 - - n_strx: 46 - n_type: 0x66 - n_sect: 0 - n_desc: 1 - n_value: 1636754403 - - n_strx: 1 - n_type: 0x2E - n_sect: 1 - n_desc: 0 - 
n_value: 4294983576 - - n_strx: 109 - n_type: 0x24 - n_sect: 1 - n_desc: 0 - n_value: 4294983576 - - n_strx: 1 - n_type: 0x24 - n_sect: 0 - n_desc: 0 - n_value: 4 - - n_strx: 1 - n_type: 0x4E - n_sect: 1 - n_desc: 0 - n_value: 4 - - n_strx: 1 - n_type: 0x2E - n_sect: 1 - n_desc: 0 - n_value: 4294983580 - - n_strx: 114 - n_type: 0x24 - n_sect: 1 - n_desc: 0 - n_value: 4294983580 - - n_strx: 1 - n_type: 0x24 - n_sect: 0 - n_desc: 0 - n_value: 20 - - n_strx: 1 - n_type: 0x4E - n_sect: 1 - n_desc: 0 - n_value: 20 - - n_strx: 1 - n_type: 0x64 - n_sect: 1 - n_desc: 0 - n_value: 0 - - n_strx: 2 - n_type: 0xF - n_sect: 1 - n_desc: 16 - n_value: 4294967296 - - n_strx: 22 - n_type: 0xF - n_sect: 1 - n_desc: 0 - n_value: 4294983576 - - n_strx: 27 - n_type: 0xF - n_sect: 1 - n_desc: 0 - n_value: 4294983580 - StringTable: - - ' ' - - __mh_execute_header - - _foo - - _main - - '/tmp/' - - main.c - - '/var/folders/gj/wf3swl0x215b2sq1qy84kzkm0000gn/T/main-e32fe7.o' - - _foo - - _main +__LINKEDIT: 00000000200000002C0000002C000000000000000100000000000000000000000200000000000000000000000000000000015F666F6F000804008080010000000000000000000000020000000F010000004000000000000020005F666F6F0000 ... diff --git a/llvm/test/tools/llvm-reduce/simplify-cfg.ll b/llvm/test/tools/llvm-reduce/simplify-cfg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/tools/llvm-reduce/simplify-cfg.ll @@ -0,0 +1,33 @@ +; RUN: llvm-reduce --delta-passes=simplify-cfg --test %python --test-arg %p/Inputs/remove-bbs.py -abort-on-invalid-reduction %s -o %t + +; RUN: FileCheck --check-prefix=CHECK-FINAL %s --input-file=%t +; CHECK-FINAL: @f1 +; CHECK-FINAL-NOT: x6: +; CHECK-FINAL-NOT: x10: + +define void @f1(ptr %interesting3, i32 %interesting2) { + %x3 = alloca ptr, i32 0, align 8 + store ptr %interesting3, ptr %interesting3, align 8 + switch i32 %interesting2, label %interesting1 [ + i32 0, label %x6 + i32 1, label %x11 + ] + +x4: + %x5 = call ptr @f2() + br label %x10 + +x10: + br label %interesting1 + +x6: + br label %x11 + +x11: + br label %interesting1 + +interesting1: + ret void +} + +declare ptr @f2() diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp --- a/llvm/tools/gold/gold-plugin.cpp +++ b/llvm/tools/gold/gold-plugin.cpp @@ -722,8 +722,8 @@ // Returns true if S is valid as a C language identifier. 
static bool isValidCIdentifier(StringRef S) { return !S.empty() && (isAlpha(S[0]) || S[0] == '_') && - std::all_of(S.begin() + 1, S.end(), - [](char C) { return C == '_' || isAlnum(C); }); + llvm::all_of(llvm::drop_begin(S), + [](char C) { return C == '_' || isAlnum(C); }); } static bool isUndefined(ld_plugin_symbol &Sym) { diff --git a/llvm/tools/llvm-objdump/MachODump.h b/llvm/tools/llvm-objdump/MachODump.h --- a/llvm/tools/llvm-objdump/MachODump.h +++ b/llvm/tools/llvm-objdump/MachODump.h @@ -36,6 +36,7 @@ extern bool Bind; extern bool DataInCode; extern std::string DisSymName; +extern bool ChainedFixups; extern bool DyldInfo; extern bool DylibId; extern bool DylibsUsed; diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp --- a/llvm/tools/llvm-objdump/MachODump.cpp +++ b/llvm/tools/llvm-objdump/MachODump.cpp @@ -81,6 +81,7 @@ bool objdump::FunctionStarts; bool objdump::LinkOptHints; bool objdump::InfoPlist; +bool objdump::ChainedFixups; bool objdump::DyldInfo; bool objdump::DylibsUsed; bool objdump::DylibId; @@ -112,6 +113,7 @@ FunctionStarts = InputArgs.hasArg(OBJDUMP_function_starts); LinkOptHints = InputArgs.hasArg(OBJDUMP_link_opt_hints); InfoPlist = InputArgs.hasArg(OBJDUMP_info_plist); + ChainedFixups = InputArgs.hasArg(OBJDUMP_chained_fixups); DyldInfo = InputArgs.hasArg(OBJDUMP_dyld_info); DylibsUsed = InputArgs.hasArg(OBJDUMP_dylibs_used); DylibId = InputArgs.hasArg(OBJDUMP_dylib_id); @@ -1193,6 +1195,48 @@ reportError(std::move(Err), Obj->getFileName()); } +static void +PrintChainedFixupsHeader(const MachO::dyld_chained_fixups_header &H) { + outs() << "chained fixups header (LC_DYLD_CHAINED_FIXUPS)\n"; + outs() << " fixups_version = " << H.fixups_version << '\n'; + outs() << " starts_offset = " << H.starts_offset << '\n'; + outs() << " imports_offset = " << H.imports_offset << '\n'; + outs() << " symbols_offset = " << H.symbols_offset << '\n'; + outs() << " imports_count = " << H.imports_count << '\n'; + + outs() << " imports_format = " << H.imports_format; + switch (H.imports_format) { + case llvm::MachO::DYLD_CHAINED_IMPORT: + outs() << " (DYLD_CHAINED_IMPORT)"; + break; + case llvm::MachO::DYLD_CHAINED_IMPORT_ADDEND: + outs() << " (DYLD_CHAINED_IMPORT_ADDEND)"; + break; + case llvm::MachO::DYLD_CHAINED_IMPORT_ADDEND64: + outs() << " (DYLD_CHAINED_IMPORT_ADDEND64)"; + break; + } + outs() << '\n'; + + outs() << " symbols_format = " << H.symbols_format; + if (H.symbols_format == llvm::MachO::DYLD_CHAINED_SYMBOL_ZLIB) + outs() << " (zlib compressed)"; + outs() << '\n'; +} + +static void PrintChainedFixups(MachOObjectFile *O) { + // MachOObjectFile::getChainedFixupsHeader() reads LC_DYLD_CHAINED_FIXUPS. + // FIXME: Support chained fixups in __TEXT,__chain_starts section too. + auto ChainedFixupHeader = + unwrapOrError(O->getChainedFixupsHeader(), O->getFileName()); + if (!ChainedFixupHeader) + return; + + PrintChainedFixupsHeader(*ChainedFixupHeader); + + // FIXME: Print more things. +} + static void PrintDyldInfo(MachOObjectFile *O) { outs() << "dyld information:" << '\n'; printMachOChainedFixups(O); @@ -1916,8 +1960,9 @@ // UniversalHeaders or ArchiveHeaders. 
if (Disassemble || Relocations || PrivateHeaders || ExportsTrie || Rebase || Bind || SymbolTable || LazyBind || WeakBind || IndirectSymbols || - DataInCode || FunctionStarts || LinkOptHints || DyldInfo || DylibsUsed || - DylibId || Rpaths || ObjcMetaData || (!FilterSections.empty())) { + DataInCode || FunctionStarts || LinkOptHints || ChainedFixups || + DyldInfo || DylibsUsed || DylibId || Rpaths || ObjcMetaData || + (!FilterSections.empty())) { if (LeadingHeaders) { outs() << Name; if (!ArchiveMemberName.empty()) @@ -1986,6 +2031,8 @@ DumpSectionContents(FileName, MachOOF, Verbose); if (InfoPlist) DumpInfoPlistSectionContents(FileName, MachOOF); + if (ChainedFixups) + PrintChainedFixups(MachOOF); if (DyldInfo) PrintDyldInfo(MachOOF); if (DylibsUsed) diff --git a/llvm/tools/llvm-objdump/ObjdumpOpts.td b/llvm/tools/llvm-objdump/ObjdumpOpts.td --- a/llvm/tools/llvm-objdump/ObjdumpOpts.td +++ b/llvm/tools/llvm-objdump/ObjdumpOpts.td @@ -299,11 +299,15 @@ "Mach-O objects (requires --macho)">, Group; +def chained_fixups : Flag<["--"], "chained-fixups">, + HelpText<"Print chained fixup information (requires --macho)">, + Group; + def dyld_info : Flag<["--"], "dyld_info">, - HelpText<"Print bind and rebase information used by dyld to resolve " - "external references in a final linked binary " - "(requires --macho)">, - Group; + HelpText<"Print bind and rebase information used by dyld to resolve " + "external references in a final linked binary " + "(requires --macho)">, + Group; def dylibs_used : Flag<["--"], "dylibs-used">, HelpText<"Print the shared libraries used for linked " diff --git a/llvm/tools/llvm-objdump/OtoolOpts.td b/llvm/tools/llvm-objdump/OtoolOpts.td --- a/llvm/tools/llvm-objdump/OtoolOpts.td +++ b/llvm/tools/llvm-objdump/OtoolOpts.td @@ -37,13 +37,15 @@ def x : Flag<["-"], "x">, HelpText<"print all text sections">; def X : Flag<["-"], "X">, HelpText<"omit leading addresses or headers">; +def chained_fixups : Flag<["-"], "chained_fixups">, + HelpText<"print chained fixup information">; + // Not (yet?) 
implemented: // def a : Flag<["-"], "a">, HelpText<"print archive header">; // -c print argument strings of a core file // -m don't use archive(member) syntax // -dyld_info // -dyld_opcodes -// -chained_fixups // -addr_slide=arg // -function_offsets diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -2787,6 +2787,8 @@ FilterSections.push_back(",__text"); LeadingAddr = LeadingHeaders = !InputArgs.hasArg(OTOOL_X); + ChainedFixups = InputArgs.hasArg(OTOOL_chained_fixups); + InputFilenames = InputArgs.getAllArgValues(OTOOL_INPUT); if (InputFilenames.empty()) reportCmdLineError("no input file"); @@ -2990,11 +2992,12 @@ !DynamicRelocations && !FileHeaders && !PrivateHeaders && !RawClangAST && !Relocations && !SectionHeaders && !SectionContents && !SymbolTable && !DynamicSymbolTable && !UnwindInfo && !FaultMapSection && !Offloading && - !(MachOOpt && (Bind || DataInCode || DyldInfo || DylibId || DylibsUsed || - ExportsTrie || FirstPrivateHeader || FunctionStarts || - IndirectSymbols || InfoPlist || LazyBind || LinkOptHints || - ObjcMetaData || Rebase || Rpaths || UniversalHeaders || - WeakBind || !FilterSections.empty()))) { + !(MachOOpt && + (Bind || DataInCode || ChainedFixups || DyldInfo || DylibId || + DylibsUsed || ExportsTrie || FirstPrivateHeader || FunctionStarts || + IndirectSymbols || InfoPlist || LazyBind || LinkOptHints || + ObjcMetaData || Rebase || Rpaths || UniversalHeaders || WeakBind || + !FilterSections.empty()))) { T->printHelp(ToolName); return 2; } diff --git a/llvm/tools/llvm-reduce/CMakeLists.txt b/llvm/tools/llvm-reduce/CMakeLists.txt --- a/llvm/tools/llvm-reduce/CMakeLists.txt +++ b/llvm/tools/llvm-reduce/CMakeLists.txt @@ -49,6 +49,7 @@ deltas/ReduceRegisterMasks.cpp deltas/ReduceRegisterDefs.cpp deltas/ReduceRegisterUses.cpp + deltas/ReduceUsingSimplifyCFG.cpp deltas/RunIRPasses.cpp deltas/SimplifyInstructions.cpp llvm-reduce.cpp diff --git a/llvm/tools/llvm-reduce/DeltaManager.cpp b/llvm/tools/llvm-reduce/DeltaManager.cpp --- a/llvm/tools/llvm-reduce/DeltaManager.cpp +++ b/llvm/tools/llvm-reduce/DeltaManager.cpp @@ -39,6 +39,7 @@ #include "deltas/ReduceRegisterMasks.h" #include "deltas/ReduceRegisterUses.h" #include "deltas/ReduceSpecialGlobals.h" +#include "deltas/ReduceUsingSimplifyCFG.h" #include "deltas/ReduceVirtualRegisters.h" #include "deltas/RunIRPasses.h" #include "deltas/SimplifyInstructions.h" @@ -75,6 +76,7 @@ DELTA_PASS("operands-to-args", reduceOperandsToArgsDeltaPass) \ DELTA_PASS("operands-skip", reduceOperandsSkipDeltaPass) \ DELTA_PASS("operand-bundles", reduceOperandBundesDeltaPass) \ + DELTA_PASS("simplify-cfg", reduceUsingSimplifyCFGDeltaPass) \ DELTA_PASS("attributes", reduceAttributesDeltaPass) \ DELTA_PASS("module-data", reduceModuleDataDeltaPass) \ } while (false) diff --git a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.h b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.h new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.h @@ -0,0 +1,23 @@ +//===- ReduceUsingSimplifyCFG.h - Specialized Delta Pass ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to call SimplifyCFG on individual basic blocks. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_SIMPLIFYCFG_H +#define LLVM_TOOLS_LLVM_REDUCE_DELTAS_SIMPLIFYCFG_H + +#include "Delta.h" + +namespace llvm { +void reduceUsingSimplifyCFGDeltaPass(TestRunner &Test); +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp new file mode 100644 --- /dev/null +++ b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp @@ -0,0 +1,34 @@ +//===- ReduceUsingSimplifyCFG.cpp - Specialized Delta Pass ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to call SimplifyCFG on individual basic blocks. +// +//===----------------------------------------------------------------------===// + +#include "ReduceUsingSimplifyCFG.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +static void reduceUsingSimplifyCFG(Oracle &O, Module &Program) { + SmallVector<BasicBlock *> ToSimplify; + for (auto &F : Program) + for (auto &BB : F) + if (!O.shouldKeep()) + ToSimplify.push_back(&BB); + TargetTransformInfo TTI(Program.getDataLayout()); + for (auto *BB : ToSimplify) + simplifyCFG(BB, TTI); +} + +void llvm::reduceUsingSimplifyCFGDeltaPass(TestRunner &Test) { + outs() << "*** Reducing using SimplifyCFG...\n"; + runDeltaPass(Test, reduceUsingSimplifyCFG); +} diff --git a/llvm/tools/llvm-reduce/llvm-reduce.cpp b/llvm/tools/llvm-reduce/llvm-reduce.cpp --- a/llvm/tools/llvm-reduce/llvm-reduce.cpp +++ b/llvm/tools/llvm-reduce/llvm-reduce.cpp @@ -17,27 +17,19 @@ #include "DeltaManager.h" #include "ReducerWorkItem.h" #include "TestRunner.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" -#include "llvm/ADT/SmallString.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/CodeGen/CommandFlags.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" -#include "llvm/MC/TargetRegistry.h" +#include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Host.h" #include "llvm/Support/InitLLVM.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetSelect.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h" #include <system_error> #include <vector> @@ -117,9 +109,19 @@ void writeBitcode(ReducerWorkItem &M, llvm::raw_ostream &OutStream) { if (M.LTOInfo && M.LTOInfo->IsThinLTO && M.LTOInfo->EnableSplitLTOUnit) { - legacy::PassManager PM; - 
PM.add(llvm::createWriteThinLTOBitcodePass(OutStream)); - PM.run(*(M.M)); + PassBuilder PB; + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + ModulePassManager MPM; + MPM.addPass(ThinLTOBitcodeWriterPass(OutStream, nullptr)); + MPM.run(*M.M, MAM); } else { std::unique_ptr<ModuleSummaryIndex> Index; if (M.LTOInfo && M.LTOInfo->HasSummary) { diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -970,8 +970,8 @@ report_fatal_error("Text output is incompatible with -module-hash"); Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder)); } else if (OutputThinLTOBC) - Passes.add(createWriteThinLTOBitcodePass( - *OS, ThinLinkOut ? &ThinLinkOut->os() : nullptr)); + report_fatal_error( + "Use the new pass manager for printing ThinLTO bitcode"); else Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash)); diff --git a/llvm/unittests/ADT/SmallSetTest.cpp b/llvm/unittests/ADT/SmallSetTest.cpp --- a/llvm/unittests/ADT/SmallSetTest.cpp +++ b/llvm/unittests/ADT/SmallSetTest.cpp @@ -21,11 +21,17 @@ SmallSet<int, 4> s1; - for (int i = 0; i < 4; i++) - s1.insert(i); + for (int i = 0; i < 4; i++) { + auto InsertResult = s1.insert(i); + EXPECT_EQ(*InsertResult.first, i); + EXPECT_EQ(InsertResult.second, true); + } - for (int i = 0; i < 4; i++) - s1.insert(i); + for (int i = 0; i < 4; i++) { + auto InsertResult = s1.insert(i); + EXPECT_EQ(*InsertResult.first, i); + EXPECT_EQ(InsertResult.second, false); + } EXPECT_EQ(4u, s1.size()); @@ -38,8 +44,17 @@ TEST(SmallSetTest, Grow) { SmallSet<int, 4> s1; - for (int i = 0; i < 8; i++) - s1.insert(i); + for (int i = 0; i < 8; i++) { + auto InsertResult = s1.insert(i); + EXPECT_EQ(*InsertResult.first, i); + EXPECT_EQ(InsertResult.second, true); + } + + for (int i = 0; i < 8; i++) { + auto InsertResult = s1.insert(i); + EXPECT_EQ(*InsertResult.first, i); + EXPECT_EQ(InsertResult.second, false); + } EXPECT_EQ(8u, s1.size()); diff --git a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp --- a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp +++ b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp @@ -361,7 +361,7 @@ char32_t Codepoint = Entry.first; const std::string &Name = Entry.second; // Ignore names which are not valid. 
- if (Name.empty() || !std::all_of(Name.begin(), Name.end(), [](char C) { + if (Name.empty() || !llvm::all_of(Name, [](char C) { return llvm::is_contained(Letters, C); })) { continue; diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn --- a/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/BUILD.gn @@ -7,7 +7,10 @@ deps += [ "//compiler-rt/lib/msan" ] } if (current_os == "linux" || current_os == "android") { - deps += [ "//compiler-rt/lib/ubsan_minimal" ] + deps += [ + "//compiler-rt/lib/ubsan", + "//compiler-rt/lib/ubsan_minimal", + ] } if (current_os != "win" && current_os != "baremetal") { deps += [ "//compiler-rt/lib/asan" ] diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn --- a/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/ubsan/BUILD.gn @@ -1,3 +1,12 @@ +import("//compiler-rt/target.gni") + +group("ubsan") { + deps = [ + ":ubsan_standalone", + ":ubsan_standalone_cxx", + ] +} + source_set("sources") { configs -= [ "//llvm/utils/gn/build:llvm_code" ] configs += [ "//llvm/utils/gn/build:crt_code" ] @@ -46,7 +55,6 @@ sources = [ "ubsan_win_dynamic_runtime_thunk.cpp" ] } -# Unreferenced; at the moment exists to make sync_source_lists_from_cmake happy. source_set("standalone_sources") { configs -= [ "//llvm/utils/gn/build:llvm_code" ] configs -= [ "//llvm/utils/gn/build:no_rtti" ] @@ -57,6 +65,11 @@ "ubsan_init_standalone.cpp", "ubsan_signals_standalone.cpp", ] + deps = [ + ":sources", + "//compiler-rt/lib/interception:sources", + "//compiler-rt/lib/sanitizer_common:sources", + ] } source_set("cxx_sources") { @@ -72,3 +85,34 @@ "ubsan_type_hash_win.cpp", ] } + +# FIXME: Make ubsan_standalone work on mac. 
+if (current_os != "mac") { + static_library("ubsan_standalone") { + output_dir = crt_current_out_dir + output_name = "clang_rt.ubsan_standalone$crt_current_target_suffix" + complete_static_lib = true + configs -= [ + "//llvm/utils/gn/build:llvm_code", + "//llvm/utils/gn/build:thin_archive", + ] + deps = [ + ":sources", + ":standalone_sources", + ] + configs += [ "//llvm/utils/gn/build:crt_code" ] + sources = [ "ubsan_init_standalone_preinit.cpp" ] + } + + static_library("ubsan_standalone_cxx") { + output_dir = crt_current_out_dir + output_name = "clang_rt.ubsan_standalone_cxx$crt_current_target_suffix" + complete_static_lib = true + configs -= [ + "//llvm/utils/gn/build:llvm_code", + "//llvm/utils/gn/build:thin_archive", + ] + deps = [ ":cxx_sources" ] + configs += [ "//llvm/utils/gn/build:crt_code" ] + } +} diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn @@ -25,6 +25,7 @@ "ELF.cpp", "ELFLinkGraphBuilder.cpp", "ELF_aarch64.cpp", + "ELF_i386.cpp", "ELF_riscv.cpp", "ELF_x86_64.cpp", "JITLink.cpp", @@ -36,6 +37,7 @@ "MachO_x86_64.cpp", "MemoryFlags.cpp", "aarch64.cpp", + "i386.cpp", "riscv.cpp", "x86_64.cpp", ] diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn @@ -40,6 +40,7 @@ "deltas/ReduceRegisterMasks.cpp", "deltas/ReduceRegisterUses.cpp", "deltas/ReduceSpecialGlobals.cpp", + "deltas/ReduceUsingSimplifyCFG.cpp", "deltas/ReduceVirtualRegisters.cpp", "deltas/RunIRPasses.cpp", "deltas/SimplifyInstructions.cpp", diff --git a/llvm/utils/release/test-release.sh b/llvm/utils/release/test-release.sh --- a/llvm/utils/release/test-release.sh +++ b/llvm/utils/release/test-release.sh @@ -35,6 +35,7 @@ do_libunwind="yes" do_test_suite="yes" do_openmp="yes" +do_bolt="no" do_lld="yes" do_lldb="yes" do_polly="yes" @@ -163,6 +164,12 @@ -no-openmp ) do_openmp="no" ;; + -bolt ) + do_bolt="yes" + ;; + -no-bolt ) + do_bolt="no" + ;; -no-lld ) do_lld="no" ;; @@ -265,6 +272,9 @@ if [ $do_openmp = "yes" ]; then projects="$projects openmp" fi +if [ $do_bolt = "yes" ]; then + projects="$projects bolt" +fi if [ $do_lld = "yes" ]; then projects="$projects lld" fi diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake --- a/mlir/cmake/modules/AddMLIR.cmake +++ b/mlir/cmake/modules/AddMLIR.cmake @@ -159,7 +159,7 @@ " filepath: \"${LLVM_TARGET_DEFINITIONS_ABSOLUTE}\"\n" " includes: \"${CMAKE_CURRENT_SOURCE_DIR};${tblgen_includes}\"\n" ) - + add_public_tablegen_target(${target}) endfunction() @@ -490,6 +490,17 @@ ${ARG_PUBLIC_LIBS} ) target_sources(${name} PRIVATE ${_objects}) + + # Linux defaults to allowing undefined symbols in shared libraries whereas + # many other platforms are more strict. We want these libraries to be + # self contained, and we want any undefined symbols to be reported at + # library construction time, not at library use, so make Linux strict too. + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + target_link_options(${name} PRIVATE + "LINKER:-z,defs" + ) + endif() + # TODO: Should be transitive. 
set_target_properties(${name} PROPERTIES MLIR_AGGREGATE_EXCLUDE_LIBS "${_embed_libs}") diff --git a/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h --- a/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DeadCodeAnalysis.h @@ -41,9 +41,6 @@ /// The state is initialized by default. bool isUninitialized() const override { return false; } - /// The state is always initialized. - ChangeResult defaultInitialize() override { return ChangeResult::NoChange; } - /// Set the state of the program point to live. ChangeResult setToLive(); @@ -101,9 +98,6 @@ /// The state is initialized by default. bool isUninitialized() const override { return false; } - /// The state is always initialized. - ChangeResult defaultInitialize() override { return ChangeResult::NoChange; } - /// Print the known predecessors. void print(raw_ostream &os) const override; diff --git a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h --- a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h @@ -42,10 +42,6 @@ /// Reset the dense lattice to a pessimistic value. This occurs when the /// analysis cannot reason about the data-flow. virtual ChangeResult reset() = 0; - - /// Returns true if the lattice state has reached a pessimistic fixpoint. That - /// is, no further modifications to the lattice can occur. - virtual bool isAtFixpoint() const = 0; }; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h --- a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h @@ -38,10 +38,6 @@ /// if the value of the lattice changed. virtual ChangeResult join(const AbstractSparseLattice &rhs) = 0; - /// Returns true if the lattice element is at fixpoint and further calls to - /// `join` will not update the value of the element. - virtual bool isAtFixpoint() const = 0; - /// Mark the lattice element as having reached a pessimistic fixpoint. This /// means that the lattice may potentially have conflicting value states, and /// only the most conservative value should be relied on. @@ -97,16 +93,6 @@ /// Returns true if the value of this lattice hasn't yet been initialized. bool isUninitialized() const override { return !optimisticValue.has_value(); } - /// Force the initialization of the element by setting it to its pessimistic - /// fixpoint. - ChangeResult defaultInitialize() override { - return markPessimisticFixpoint(); - } - - /// Returns true if the lattice has reached a fixpoint. A fixpoint is when - /// the information optimistically assumed to be true is the same as the - /// information known to be true. - bool isAtFixpoint() const override { return optimisticValue == knownValue; } /// Join the information contained in the 'rhs' lattice into this /// lattice. Returns if the state of the current lattice changed. @@ -114,8 +100,8 @@ const Lattice &rhsLattice = static_cast &>(rhs); - // If we are at a fixpoint, or rhs is uninitialized, there is nothing to do. - if (isAtFixpoint() || rhsLattice.isUninitialized()) + // If rhs is uninitialized, there is nothing to do. + if (rhsLattice.isUninitialized()) return ChangeResult::NoChange; // Join the rhs value into this lattice. 
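For intuition, here is a minimal standalone C++ sketch (hypothetical names, not the MLIR API) of the optimistic-join behavior the hunk above preserves: with the isAtFixpoint() short-circuit removed, join proceeds whenever rhs is initialized and simply reports whether the optimistic value actually changed.

#include <optional>

enum class ToyChangeResult { NoChange, Change };

// Join `rhs` into `lhs` under a user-supplied join operator. Assumes ValueT is
// equality-comparable; an empty optional models "uninitialized".
template <typename ValueT, typename JoinFn>
ToyChangeResult joinOptimistic(std::optional<ValueT> &lhs,
                               const std::optional<ValueT> &rhs, JoinFn join) {
  if (!rhs) // rhs uninitialized: nothing to do.
    return ToyChangeResult::NoChange;
  if (!lhs) { // lhs uninitialized: adopt rhs wholesale.
    lhs = rhs;
    return ToyChangeResult::Change;
  }
  ValueT joined = join(*lhs, *rhs);
  if (joined == *lhs) // Already converged for this pair of states.
    return ToyChangeResult::NoChange;
  lhs = joined;
  return ToyChangeResult::Change;
}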
@@ -150,7 +136,7 @@ /// means that the lattice may potentially have conflicting value states, /// and only the conservatively known value state should be relied on. ChangeResult markPessimisticFixpoint() override { - if (isAtFixpoint()) + if (optimisticValue == knownValue) return ChangeResult::NoChange; // For this fixed point, we take whatever we knew to be true and set that diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h --- a/mlir/include/mlir/Analysis/DataFlowFramework.h +++ b/mlir/include/mlir/Analysis/DataFlowFramework.h @@ -291,10 +291,6 @@ /// Returns true if the analysis state is uninitialized. virtual bool isUninitialized() const = 0; - /// Force an uninitialized analysis state to initialize itself with a default - /// value. - virtual ChangeResult defaultInitialize() = 0; - /// Print the contents of the analysis state. virtual void print(raw_ostream &os) const = 0; diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -369,7 +369,10 @@ /// to /// /// %iv = %lb + %procId * %step - CyclicNumProcsEqNumIters = 2 + CyclicNumProcsEqNumIters = 2, + + /// No Distribution. + None = 3 }; /// Callback function type used to get processor ID, and number of processors @@ -377,11 +380,10 @@ struct ProcInfo { Value procId; Value nprocs; + DistributionMethod distributionMethod; }; -using ProcInfoCallBackFn = std::function( +using ProcInfoCallBackFn = std::function( OpBuilder &b, Location loc, ArrayRef parallelLoopRanges)>; -using OneDimProcInfoCallBackFn = - std::function; /// Options that allow distribution of loops generated in Linalg transforms to /// processors while generating the loops. @@ -389,21 +391,10 @@ /// Callback function that returns the Values for processor ID (`procId`), and /// number of processors (`nprocs`) used to execute the parallel loops. The /// number of `{procId, nprocs}` pairs returned must be equal to the number of - /// `parallelLoopRanges` passed into the callback, which in-turn is same as - /// the number of parallel loops for which the `distributionMethod` is - /// specified below. + /// `parallelLoopRanges` passed into the callback. The `parallelLoopRanges` + /// are ranges of the outer parallel loops of the operation that + /// do have non-zero tile sizes specified. ProcInfoCallBackFn procInfo; - /// Specification of how to distribute the `scf.parallel` loops that are - /// generated. As the `scf.parallel` loop is generated, the elements of this - /// vector is used (from left to right) and the specified distribution is - /// applied. If the vector is less than the number of `scf.parallel` loops - /// generated, then no distribution is applied. - SmallVector distributionMethod = {}; - - /// The map keyed by the distribution type that contains callback functions - /// that return the Values for processor ID (`procId`), and number of - /// processors (`nprocs`) used to execute the parallel loops. - DenseMap procInfoMap; }; /// Update the `lb`, `ub` and `step` to get per processor `lb`, `ub` and `step`. 
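The cyclic mappings referenced above are easy to model in scalar arithmetic. A hedged sketch follows (LoopBounds and distributeCyclic are illustrative helpers, not Linalg APIs; it assumes loops normalized to lb/ub/step form):

#include <cstdint>

struct LoopBounds {
  int64_t lb, ub, step;
};

// Cyclic distribution: processor `procId` of `nprocs` starts at
// lb + procId * step and strides by nprocs * step. When nprocs equals the
// iteration count (CyclicNumProcsEqNumIters), each processor executes exactly
// one iteration, which degenerates to the documented form
//   iv = lb + procId * step.
LoopBounds distributeCyclic(LoopBounds l, int64_t procId, int64_t nprocs) {
  return {l.lb + procId * l.step, l.ub, l.step * nprocs};
}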
@@ -521,8 +512,7 @@ function_ref bodyBuilderFn, - Optional = None, - ArrayRef distributionTypes = {}); + ArrayRef procInfo = {}); }; } // namespace linalg diff --git a/mlir/include/mlir/Dialect/Math/IR/MathOps.td b/mlir/include/mlir/Dialect/Math/IR/MathOps.td --- a/mlir/include/mlir/Dialect/Math/IR/MathOps.td +++ b/mlir/include/mlir/Dialect/Math/IR/MathOps.td @@ -538,6 +538,7 @@ %a = math.ipowi %b, %c : i32 ``` }]; + let hasFolder = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td @@ -27,12 +27,12 @@ // In addition to normal types arithmetic instructions can support cooperative // matrix. let arguments = (ins - SPV_ScalarOrVectorOrCoopMatrixOf:$operand1, - SPV_ScalarOrVectorOrCoopMatrixOf:$operand2 + SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf:$operand1, + SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf:$operand2 ); let results = (outs - SPV_ScalarOrVectorOrCoopMatrixOf:$result + SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf:$result ); let assemblyFormat = "operands attr-dict `:` type($result)"; } diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td @@ -64,6 +64,27 @@ TypedArrayAttrBase; +// Description of the supported joint matrix operations. See +// https://github.com/intel/llvm/blob/sycl/sycl/doc/design/spirv-extensions/SPV_INTEL_joint_matrix.asciidoc +def SPV_JointMatrixPropertiesINTELAttr : + SPV_Attr<"JointMatrixPropertiesINTEL", "joint_matrix_props"> { + let parameters = (ins + "int":$m_size, + "int":$n_size, + "int":$k_size, + "mlir::Type":$a_type, + "mlir::Type":$b_type, + "mlir::Type":$c_type, + "mlir::Type":$result_type, + "mlir::spirv::ScopeAttr":$scope + ); + let assemblyFormat = "`<` struct(params) `>`"; +} + +def SPV_JointMatrixPropertiesINTELArrayAttr : + TypedArrayAttrBase; + // This attribute specifies the limits for various resources on the target // architecture. 
// diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -387,6 +387,7 @@ def SPV_INTEL_fp_fast_math_mode : I32EnumAttrCase<"SPV_INTEL_fp_fast_math_mode", 4027>; def SPV_INTEL_memory_access_aliasing : I32EnumAttrCase<"SPV_INTEL_memory_access_aliasing", 4028>; def SPV_INTEL_split_barrier : I32EnumAttrCase<"SPV_INTEL_split_barrier", 4029>; +def SPV_INTEL_joint_matrix : I32EnumAttrCase<"SPV_INTEL_joint_matrix", 4030>; def SPV_NV_compute_shader_derivatives : I32EnumAttrCase<"SPV_NV_compute_shader_derivatives", 5000>; def SPV_NV_cooperative_matrix : I32EnumAttrCase<"SPV_NV_cooperative_matrix", 5001>; @@ -443,7 +444,7 @@ SPV_INTEL_usm_storage_classes, SPV_INTEL_io_pipes, SPV_INTEL_blocking_pipes, SPV_INTEL_fpga_reg, SPV_INTEL_long_constant_composite, SPV_INTEL_optnone, SPV_INTEL_debug_module, SPV_INTEL_fp_fast_math_mode, - SPV_INTEL_memory_access_aliasing, SPV_INTEL_split_barrier, + SPV_INTEL_memory_access_aliasing, SPV_INTEL_split_barrier, SPV_INTEL_joint_matrix, SPV_NV_compute_shader_derivatives, SPV_NV_cooperative_matrix, SPV_NV_fragment_shader_barycentric, SPV_NV_geometry_shader_passthrough, SPV_NV_mesh_shader, SPV_NV_ray_tracing, SPV_NV_sample_mask_override_coverage, @@ -1390,6 +1391,12 @@ ]; } +def SPV_C_JointMatrixINTEL : I32EnumAttrCase<"JointMatrixINTEL", 6118> { + list availability = [ + Extension<[SPV_INTEL_joint_matrix]> + ]; +} + def SPV_CapabilityAttr : SPV_I32EnumAttr<"Capability", "valid SPIR-V Capability", "capability", [ SPV_C_Matrix, SPV_C_Addresses, SPV_C_Linkage, SPV_C_Kernel, SPV_C_Float16, @@ -1481,7 +1488,7 @@ SPV_C_UniformTexelBufferArrayNonUniformIndexing, SPV_C_StorageTexelBufferArrayNonUniformIndexing, SPV_C_ShaderViewportIndexLayerEXT, SPV_C_ShaderViewportMaskNV, - SPV_C_ShaderStereoViewNV + SPV_C_ShaderStereoViewNV, SPV_C_JointMatrixINTEL ]>; def SPV_AM_Logical : I32EnumAttrCase<"Logical", 0>; @@ -3981,6 +3988,16 @@ "image_sampler_use_info", [SPV_ISUI_SamplerUnknown, SPV_ISUI_NeedSampler, SPV_ISUI_NoSampler]>; +def SPV_ML_ColumnMajor : I32EnumAttrCase<"ColumnMajor", 0>; +def SPV_ML_RowMajor : I32EnumAttrCase<"RowMajor", 1>; +def SPV_ML_PackedA : I32EnumAttrCase<"PackedA", 2>; +def SPV_ML_PackedB : I32EnumAttrCase<"PackedB", 3>; + +def SPV_MatrixLayoutAttr : + SPV_I32EnumAttr<"MatrixLayout", "valid SPIR-V MatrixLayout", "matrixLayout", [ + SPV_ML_ColumnMajor, SPV_ML_RowMajor, SPV_ML_PackedA, SPV_ML_PackedB + ]>; + //===----------------------------------------------------------------------===// // SPIR-V attribute definitions //===----------------------------------------------------------------------===// @@ -4013,6 +4030,8 @@ def SPV_IsCooperativeMatrixType : CPred<"$_self.isa<::mlir::spirv::CooperativeMatrixNVType>()">; def SPV_IsImageType : CPred<"$_self.isa<::mlir::spirv::ImageType>()">; +def SPV_IsJointMatrixType : + CPred<"$_self.isa<::mlir::spirv::JointMatrixINTELType>()">; def SPV_IsMatrixType : CPred<"$_self.isa<::mlir::spirv::MatrixType>()">; def SPV_IsPtrType : CPred<"$_self.isa<::mlir::spirv::PointerType>()">; def SPV_IsRTArrayType : CPred<"$_self.isa<::mlir::spirv::RuntimeArrayType>()">; @@ -4043,6 +4062,8 @@ "any SPIR-V cooperative matrix type">; def SPV_AnyImage : DialectType; +def SPV_AnyJointMatrix : DialectType; def SPV_AnyMatrix : DialectType; def SPV_AnyRTArray : DialectType; def SPV_Composite : AnyTypeOf<[SPV_Vector, SPV_AnyArray, SPV_AnyRTArray, SPV_AnyStruct, - 
SPV_AnyCooperativeMatrix, SPV_AnyMatrix]>; + SPV_AnyCooperativeMatrix, SPV_AnyJointMatrix, SPV_AnyMatrix]>; def SPV_Type : AnyTypeOf<[ SPV_Void, SPV_Bool, SPV_Integer, SPV_Float, SPV_Vector, SPV_AnyPtr, SPV_AnyArray, SPV_AnyRTArray, SPV_AnyStruct, - SPV_AnyCooperativeMatrix, SPV_AnyMatrix, SPV_AnySampledImage + SPV_AnyCooperativeMatrix, SPV_AnyJointMatrix, SPV_AnyMatrix, + SPV_AnySampledImage ]>; def SPV_SignedInt : SignedIntOfWidths<[8, 16, 32, 64]>; @@ -4072,6 +4094,11 @@ "$_self.cast<::mlir::spirv::CooperativeMatrixNVType>().getElementType()", "Cooperative Matrix">; +class SPV_JointMatrixOfType allowedTypes> : + ContainerType, SPV_IsJointMatrixType, + "$_self.cast<::mlir::spirv::JointMatrixINTELType>().getElementType()", + "Joint Matrix">; + class SPV_ScalarOrVectorOf : AnyTypeOf<[type, VectorOfLengthAndType<[2, 3, 4, 8, 16], [type]>]>; @@ -4079,6 +4106,14 @@ AnyTypeOf<[type, VectorOfLengthAndType<[2, 3, 4, 8, 16], [type]>, SPV_CoopMatrixOfType<[type]>]>; +class SPV_ScalarOrVectorOrJointMatrixOf : + AnyTypeOf<[type, VectorOfLengthAndType<[2, 3, 4, 8, 16], [type]>, + SPV_JointMatrixOfType<[type]>]>; + +class SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf : + AnyTypeOf<[type, VectorOfLengthAndType<[2, 3, 4, 8, 16], [type]>, + SPV_CoopMatrixOfType<[type]>, SPV_JointMatrixOfType<[type]> ]>; + def SPV_ScalarOrVector : AnyTypeOf<[SPV_Scalar, SPV_Vector]>; def SPV_ScalarOrVectorOrPtr : AnyTypeOf<[SPV_ScalarOrVector, SPV_AnyPtr]>; @@ -4311,6 +4346,11 @@ def SPV_OC_OpSubgroupBlockWriteINTEL : I32EnumAttrCase<"OpSubgroupBlockWriteINTEL", 5576>; def SPV_OC_OpAssumeTrueKHR : I32EnumAttrCase<"OpAssumeTrueKHR", 5630>; def SPV_OC_OpAtomicFAddEXT : I32EnumAttrCase<"OpAtomicFAddEXT", 6035>; +def SPV_OC_OpTypeJointMatrixINTEL : I32EnumAttrCase<"OpTypeJointMatrixINTEL", 6119>; +def SPV_OC_OpJointMatrixLoadINTEL : I32EnumAttrCase<"OpJointMatrixLoadINTEL", 6120>; +def SPV_OC_OpJointMatrixStoreINTEL : I32EnumAttrCase<"OpJointMatrixStoreINTEL", 6121>; +def SPV_OC_OpJointMatrixMadINTEL : I32EnumAttrCase<"OpJointMatrixMadINTEL", 6122>; +def SPV_OC_OpTypejointMatrixWorkItemLengthINTEL : I32EnumAttrCase<"OpJointMatrixWorkItemLengthINTEL", 6410>; def SPV_OpcodeAttr : SPV_I32EnumAttr<"Opcode", "valid SPIR-V instructions", "opcode", [ @@ -4376,7 +4416,10 @@ SPV_OC_OpCooperativeMatrixLoadNV, SPV_OC_OpCooperativeMatrixStoreNV, SPV_OC_OpCooperativeMatrixMulAddNV, SPV_OC_OpCooperativeMatrixLengthNV, SPV_OC_OpSubgroupBlockReadINTEL, SPV_OC_OpSubgroupBlockWriteINTEL, - SPV_OC_OpAssumeTrueKHR, SPV_OC_OpAtomicFAddEXT + SPV_OC_OpAssumeTrueKHR, SPV_OC_OpAtomicFAddEXT, + SPV_OC_OpTypeJointMatrixINTEL, SPV_OC_OpJointMatrixLoadINTEL, + SPV_OC_OpJointMatrixStoreINTEL, SPV_OC_OpJointMatrixMadINTEL, + SPV_OC_OpTypejointMatrixWorkItemLengthINTEL ]>; // End opcode section. Generated from SPIR-V spec; DO NOT MODIFY! 
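The TableGen constraints above ultimately compile down to isa<>/element-type predicates. As a rough hand-written C++ illustration (not the generated code; the 2/3/4/8/16 vector-length check of VectorOfLengthAndType is omitted, and the function name is hypothetical), SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf<SPV_Integer> accepts roughly:

#include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h"
#include "mlir/IR/BuiltinTypes.h"

// Accept a scalar integer, an integer vector, or a cooperative/joint matrix
// whose element type is an integer. Illustrative only.
static bool isIntScalarVectorCoopOrJointMatrix(mlir::Type type) {
  using namespace mlir::spirv;
  if (auto jm = type.dyn_cast<JointMatrixINTELType>())
    return jm.getElementType().isa<mlir::IntegerType>();
  if (auto cm = type.dyn_cast<CooperativeMatrixNVType>())
    return cm.getElementType().isa<mlir::IntegerType>();
  if (auto vec = type.dyn_cast<mlir::VectorType>())
    return vec.getElementType().isa<mlir::IntegerType>();
  return type.isa<mlir::IntegerType>();
}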
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td @@ -23,11 +23,11 @@ !listconcat(traits, [NoSideEffect, SameOperandsAndResultShape])> { let arguments = (ins - SPV_ScalarOrVectorOrCoopMatrixOf:$operand + SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf:$operand ); let results = (outs - SPV_ScalarOrVectorOrCoopMatrixOf:$result + SPV_ScalarOrVectorOrCoopMatrixOfOrJointMatrixOf:$result ); let assemblyFormat = [{ $operand attr-dict `:` type($operand) `to` type($result) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVJointMatrixOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVJointMatrixOps.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVJointMatrixOps.td @@ -0,0 +1,248 @@ +//===- SPIRVJointMatrixOps.td - joint matmul ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the op definition spec of joint matrix multiply extension ops. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_SPIRV_IR_JOINT_MATRIX_OPS +#define MLIR_DIALECT_SPIRV_IR_JOINT_MATRIX_OPS + +// ----- + +def SPV_JointMatrixWorkItemLengthINTELOp : SPV_Op<"JointMatrixWorkItemLengthINTEL", + [NoSideEffect]> { + let summary = "See extension SPV_INTEL_joint_matrix"; + + let description = [{ + Return number of components owned by the current work-item in + a joint matrix. + + Result Type must be an 32-bit unsigned integer type scalar. + + Type is a joint matrix type. + + ``` {.ebnf} + joint-matrix-length-op ::= ssa-id `=` `spv.JointMatrixWorkItemLengthINTEL + ` : ` joint-matrix-type + ``` + + For example: + + ``` + %0 = spv.JointMatrixWorkItemLengthINTEL : !spv.jointmatrix + ``` + }]; + + let assemblyFormat = "attr-dict `:` $type"; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[SPV_INTEL_joint_matrix]>, + Capability<[SPV_C_JointMatrixINTEL]> + ]; + + let arguments = (ins + TypeAttr:$type + ); + + let results = (outs + SPV_Int32:$result + ); + let hasVerifier = 0; +} + +// ----- + +def SPV_JointMatrixLoadINTELOp : SPV_Op<"JointMatrixLoadINTEL", []> { + let summary = "See extension SPV_INTEL_joint_matrix"; + + let description = [{ + Load a matrix through a pointer. + + Result Type is the type of the loaded matrix. It must be OpTypeJointMatrixINTEL. + + Pointer is the pointer to load through. It specifies start of memory region where + elements of the matrix are stored and arranged according to Layout. + + Stride is the number of elements in memory between beginnings of successive rows, + columns (or words) in the result. It must be a scalar integer type. + + Layout indicates how the values loaded from memory are arranged. It must be the + result of a constant instruction. + + Scope is syncronization scope for operation on the matrix. It must be the result + of a constant instruction with scalar integer type. + + If present, any Memory Operands must begin with a memory operand literal. If not + present, it is the same as specifying the memory operand None. 
+ + #### Example: + ```mlir + %0 = spv.JointMatrixLoadINTEL %ptr, %stride + {memory_access = #spv.memory_access} : + (!spv.ptr, i32) -> + !spv.jointmatrix<8x16xi32, ColumnMajor, Subgroup> + ``` + }]; + + let assemblyFormat = [{ + $scope $layout operands attr-dict `:` `(` type(operands) `)` `->` type($result) + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[SPV_INTEL_joint_matrix]>, + Capability<[SPV_C_JointMatrixINTEL]> + ]; + + let arguments = (ins + SPV_ScopeAttr:$scope, + SPV_MatrixLayoutAttr:$layout, + SPV_AnyPtr:$pointer, + SPV_Integer:$stride, + OptionalAttr:$memory_access, + OptionalAttr:$alignment + ); + + let results = (outs + SPV_AnyJointMatrix:$result + ); +} + +// ----- + +def SPV_JointMatrixMadINTELOp : SPV_Op<"JointMatrixMadINTEL", + [NoSideEffect, AllTypesMatch<["c", "result"]>]> { + let summary = "See extension SPV_INTEL_joint_matrix"; + + let description = [{ + Multiply matrix A by matrix B and add matrix C to the result + of the multiplication: A*B+C. Here A is an M x K matrix, B is + a K x N matrix, and C is an M x N matrix. + + Behavior is undefined if the sizes of the operands do not meet the + conditions above. All operands and the Result Type must be + OpTypeJointMatrixINTEL. + + A must be an OpTypeJointMatrixINTEL whose Component Type is a + signed numerical type, whose Row Count equals M, and whose Column Count + equals K. + + B must be an OpTypeJointMatrixINTEL whose Component Type is a + signed numerical type, whose Row Count equals K, and whose Column Count + equals N. + + C and the Result Type must be an OpTypeJointMatrixINTEL with Row + Count equal to M and Column Count equal to N. + + Scope is the synchronization scope for the operation on the matrix. + It must be the result of a constant instruction with scalar + integer type. + + #### Example: + ```mlir + %r = spv.JointMatrixMadINTEL %a, %b, %c : + !spv.jointmatrix<8x32xi8, RowMajor, Subgroup>, + !spv.jointmatrix<32x8xi8, ColumnMajor, Subgroup> + -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + ``` + + }]; + + let assemblyFormat = [{ + $scope operands attr-dict`:` type($a) `,` type($b) `->` type($c) + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[SPV_INTEL_joint_matrix]>, + Capability<[SPV_C_JointMatrixINTEL]> + ]; + + let arguments = (ins + SPV_ScopeAttr:$scope, + SPV_AnyJointMatrix:$a, + SPV_AnyJointMatrix:$b, + SPV_AnyJointMatrix:$c + ); + + let results = (outs + SPV_AnyJointMatrix:$result + ); +} + +// ----- + +def SPV_JointMatrixStoreINTELOp : SPV_Op<"JointMatrixStoreINTEL", []> { + let summary = "See extension SPV_INTEL_joint_matrix"; + + let description = [{ + Store a matrix through a pointer. + + Pointer is the pointer to store through. It specifies the + start of the memory region where the elements of the matrix must + be stored and arranged according to Layout. + + Object is the matrix to store. It must be + OpTypeJointMatrixINTEL. + + Stride is the number of elements in memory between beginnings + of successive rows, columns (or words) of the Object. It must + be a scalar integer type. + + Layout indicates how the values stored to memory are arranged. + It must be the result of a constant instruction. + + Scope is the synchronization scope for the operation on the matrix. + It must be the result of a constant instruction with scalar + integer type. + + If present, any Memory Operands must begin with a memory operand + literal. If not present, it is the same as specifying the memory + operand None.
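Before the JointMatrixStoreINTEL example below, note that the shape rules quoted above for JointMatrixMadINTEL reduce to three equalities on (rows, columns); the sketch below, with assumed names, mirrors what the verifyJointMatrixMad verifier added later in this patch checks:

```cpp
// Minimal model of the M x K times K x N plus M x N rule. The struct and
// function names are hypothetical; only the equalities come from the patch.
#include <cassert>
#include <cstdint>

struct Shape {
  int64_t rows, cols;
};

bool madShapesAgree(Shape a, Shape b, Shape c, Shape result) {
  return a.rows == result.rows &&                   // M
         a.cols == b.rows &&                        // K
         b.cols == result.cols &&                   // N
         c.rows == result.rows && c.cols == result.cols;
}

int main() {
  // Matches the 8x32 * 32x8 -> 8x8 example in the op documentation.
  assert(madShapesAgree({8, 32}, {32, 8}, {8, 8}, {8, 8}));
  return 0;
}
```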
+ + #### Example: + ```mlir + spv.JointMatrixStoreINTEL %ptr, %m, %stride + {memory_access = #spv.memory_access} : (!spv.ptr, + !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, i32) + ``` + + }]; + + let assemblyFormat = [{ + $scope $layout operands attr-dict `:` `(` type(operands) `)` + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[SPV_INTEL_joint_matrix]>, + Capability<[SPV_C_JointMatrixINTEL]> + ]; + + let arguments = (ins + SPV_ScopeAttr:$scope, + SPV_MatrixLayoutAttr:$layout, + SPV_AnyPtr:$pointer, + SPV_AnyJointMatrix:$object, + SPV_Integer:$stride, + OptionalAttr:$memory_access, + OptionalAttr:$alignment + ); + + let results = (outs); +} + +// ----- + +#endif // MLIR_DIALECT_SPIRV_IR_JOINT_MATRIX_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVOps.td @@ -30,6 +30,7 @@ include "mlir/Dialect/SPIRV/IR/SPIRVCompositeOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVCooperativeMatrixOps.td" +include "mlir/Dialect/SPIRV/IR/SPIRVJointMatrixOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVGLOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVGroupOps.td" include "mlir/Dialect/SPIRV/IR/SPIRVImageOps.td" diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h @@ -29,6 +29,7 @@ struct ArrayTypeStorage; struct CooperativeMatrixTypeStorage; struct ImageTypeStorage; +struct JointMatrixTypeStorage; struct MatrixTypeStorage; struct PointerTypeStorage; struct RuntimeArrayTypeStorage; @@ -420,6 +421,33 @@ Optional storage = llvm::None); }; +// SPIR-V joint matrix type +class JointMatrixINTELType + : public Type::TypeBase { +public: + using Base::Base; + + static JointMatrixINTELType get(Type elementType, Scope scope, unsigned rows, + unsigned columns, MatrixLayout matrixLayout); + Type getElementType() const; + + /// Return the scope of the joint matrix. + Scope getScope() const; + /// return the number of rows of the matrix. + unsigned getRows() const; + /// return the number of columns of the matrix. + unsigned getColumns() const; + + /// return the layout of the matrix + MatrixLayout getMatrixLayout() const; + + void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, + Optional storage = llvm::None); + void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, + Optional storage = llvm::None); +}; + // SPIR-V matrix type class MatrixType : public Type::TypeBase { diff --git a/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h b/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h --- a/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h +++ b/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h @@ -36,6 +36,7 @@ #include #include #include +#include //===----------------------------------------------------------------------===// // Codegen-compatible structures for Vector type. 
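The CRunnerUtils.h hunks that follow turn StridedMemrefIterator into a conforming forward iterator (the descriptor is now held by pointer so iterators remain copy-assignable) and add DynamicMemRefIterator. Both advance with the same odometer scheme: bump the innermost index and, on overflow, roll it back to zero and carry into the next dimension while keeping the flat offset in sync with the strides. A standalone model of that advance, with assumed names rather than the patch's code:

```cpp
#include <array>
#include <cstdint>

// Advances `idx`/`offset` to the next element of a memref with the given
// sizes/strides; returns false once iteration is exhausted.
template <int Rank>
bool advance(std::array<int64_t, Rank> &idx, int64_t &offset,
             const int64_t *sizes, const int64_t *strides) {
  for (int dim = Rank - 1; dim >= 0; --dim) {
    if (idx[dim] + 1 < sizes[dim]) {
      ++idx[dim];             // step within this dimension
      offset += strides[dim];
      return true;
    }
    offset -= idx[dim] * strides[dim]; // roll this dimension back to zero
    idx[dim] = 0;
  }
  return false; // every dimension rolled over: past the last element
}
```

Updating `offset` incrementally this way avoids re-deriving it from all indices on every step, which is why the iterators track both `indices` and `offset`.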
@@ -209,13 +210,19 @@ template class StridedMemrefIterator { public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T *; + using reference = T &; + StridedMemrefIterator(StridedMemRefType &descriptor, int64_t offset = 0) - : offset(offset), descriptor(descriptor) {} + : offset(offset), descriptor(&descriptor) {} StridedMemrefIterator &operator++() { int dim = Rank - 1; - while (dim >= 0 && indices[dim] == (descriptor.sizes[dim] - 1)) { - offset -= indices[dim] * descriptor.strides[dim]; + while (dim >= 0 && indices[dim] == (descriptor->sizes[dim] - 1)) { + offset -= indices[dim] * descriptor->strides[dim]; indices[dim] = 0; --dim; } @@ -224,17 +231,17 @@ return *this; } ++indices[dim]; - offset += descriptor.strides[dim]; + offset += descriptor->strides[dim]; return *this; } - T &operator*() { return descriptor.data[offset]; } - T *operator->() { return &descriptor.data[offset]; } + reference operator*() { return descriptor->data[offset]; } + pointer operator->() { return &descriptor->data[offset]; } const std::array &getIndices() { return indices; } bool operator==(const StridedMemrefIterator &other) const { - return other.offset == offset && &other.descriptor == &descriptor; + return other.offset == offset && other.descriptor == descriptor; } bool operator!=(const StridedMemrefIterator &other) const { @@ -245,16 +252,24 @@ /// Offset in the buffer. This can be derived from the indices and the /// descriptor. int64_t offset = 0; + /// Array of indices in the multi-dimensional memref. std::array indices = {}; + /// Descriptor for the strided memref. - StridedMemRefType &descriptor; + StridedMemRefType *descriptor; }; /// Iterate over all elements in a 0-ranked strided memref. template class StridedMemrefIterator { public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T *; + using reference = T &; + StridedMemrefIterator(StridedMemRefType &descriptor, int64_t offset = 0) : elt(descriptor.data + offset) {} @@ -263,8 +278,8 @@ return *this; } - T &operator*() { return *elt; } - T *operator->() { return elt; } + reference operator*() { return *elt; } + pointer operator->() { return elt; } // There are no indices for a 0-ranked memref, but this API is provided for // consistency with the general case. @@ -301,10 +316,20 @@ //===----------------------------------------------------------------------===// // DynamicMemRefType type. //===----------------------------------------------------------------------===// +template +class DynamicMemRefIterator; + // A reference to one of the StridedMemRef types. 
template class DynamicMemRefType { public: + int64_t rank; + T *basePtr; + T *data; + int64_t offset; + const int64_t *sizes; + const int64_t *strides; + explicit DynamicMemRefType(const StridedMemRefType &memRef) : rank(0), basePtr(memRef.basePtr), data(memRef.data), offset(memRef.offset), sizes(nullptr), strides(nullptr) {} @@ -322,12 +347,113 @@ strides = sizes + rank; } - int64_t rank; - T *basePtr; - T *data; - int64_t offset; - const int64_t *sizes; - const int64_t *strides; + template <typename Range, typename sfinae = decltype(std::declval<Range>().begin())> + T &operator[](Range &&indices) { + assert(indices.size() == rank && + "indices should match rank in memref subscript"); + if (rank == 0) + return data[offset]; + + int64_t curOffset = offset; + for (int dim = rank - 1; dim >= 0; --dim) { + int64_t currentIndex = *(indices.begin() + dim); + assert(currentIndex < sizes[dim] && "Index overflow"); + curOffset += currentIndex * strides[dim]; + } + return data[curOffset]; + } + + DynamicMemRefIterator begin() { return {*this}; } + DynamicMemRefIterator end() { return {*this, -1}; } + + // This operator[] is extremely slow and only for sugaring purposes. + DynamicMemRefType operator[](int64_t idx) { + assert(rank > 0 && "can't make a subscript of a zero-ranked array"); + + DynamicMemRefType res(*this); + --res.rank; + res.offset += idx * res.strides[0]; + ++res.sizes; + ++res.strides; + return res; + } + + // This operator* can be used in conjunction with the previous operator[] in + // order to access the underlying value in case of zero-ranked memref. + T &operator*() { + assert(rank == 0 && "not a zero-ranked memRef"); + return data[offset]; + } + +private: + DynamicMemRefType(const DynamicMemRefType &other) + : rank(other.rank), basePtr(other.basePtr), data(other.data), + offset(other.offset), sizes(other.sizes), strides(other.strides) {} +}; + +/// Iterate over all elements in a dynamic memref. +template +class DynamicMemRefIterator { +public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T *; + using reference = T &; + + DynamicMemRefIterator(DynamicMemRefType &descriptor, int64_t offset = 0) + : offset(offset), descriptor(&descriptor) { + indices.resize(descriptor.rank, 0); + } + + DynamicMemRefIterator &operator++() { + if (descriptor->rank == 0) { + offset = -1; + return *this; + } + + int dim = descriptor->rank - 1; + + while (dim >= 0 && indices[dim] == (descriptor->sizes[dim] - 1)) { + offset -= indices[dim] * descriptor->strides[dim]; + indices[dim] = 0; + --dim; + } + + if (dim < 0) { + offset = -1; + return *this; + } + + ++indices[dim]; + offset += descriptor->strides[dim]; + return *this; + } + + reference operator*() { return descriptor->data[offset]; } + pointer operator->() { return &descriptor->data[offset]; } + + const std::vector &getIndices() { return indices; } + + bool operator==(const DynamicMemRefIterator &other) const { + return other.offset == offset && other.descriptor == descriptor; + } + + bool operator!=(const DynamicMemRefIterator &other) const { + return !(*this == other); + } + +private: + /// Offset in the buffer. This can be derived from the indices and the + /// descriptor. + int64_t offset = 0; + + /// Array of indices in the multi-dimensional memref. + std::vector indices = {}; + + /// Descriptor for the dynamic memref. 
+ DynamicMemRefType *descriptor; }; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td --- a/mlir/include/mlir/IR/OpBase.td +++ b/mlir/include/mlir/IR/OpBase.td @@ -2288,8 +2288,11 @@ class ElementType : StrFunc<"getElementTypeOrSelf($" # name # ")">; class AllMatchPred values> : - CPred<"::llvm::is_splat(::llvm::makeArrayRef({" - # !interleave(values, ", ") #"}))">; + CPred; class AllMatch values, string summary> : PredOpTrait>; diff --git a/mlir/include/mlir/Transforms/TopologicalSortUtils.h b/mlir/include/mlir/Transforms/TopologicalSortUtils.h --- a/mlir/include/mlir/Transforms/TopologicalSortUtils.h +++ b/mlir/include/mlir/Transforms/TopologicalSortUtils.h @@ -90,11 +90,23 @@ function_ref isOperandReady = nullptr); /// Given a block, sort its operations in topological order, excluding its -/// terminator if it has one. +/// terminator if it has one. This sort is stable. bool sortTopologically( Block *block, function_ref isOperandReady = nullptr); +/// Compute a topological ordering of the given ops. All ops must belong to the +/// specified block. +/// +/// This sort is not stable. +/// +/// Note: If the specified ops contain incomplete/interrupted SSA use-def +/// chains, the result may not actually be a topological sorting with respect to +/// the entire program. +bool computeTopologicalSorting( + Block *block, MutableArrayRef ops, + function_ref isOperandReady = nullptr); + } // end namespace mlir #endif // MLIR_TRANSFORMS_TOPOLOGICALSORTUTILS_H diff --git a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp --- a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp @@ -49,8 +49,6 @@ // Get the dense lattice to update. AbstractDenseLattice *after = getLattice(op); - if (after->isAtFixpoint()) - return; // If this op implements region control-flow, then control-flow dictates its // transfer function. @@ -91,8 +89,6 @@ // Get the dense lattice to update. AbstractDenseLattice *after = getLattice(block); - if (after->isAtFixpoint()) - return; // The dense lattices of entry blocks are set by region control-flow or the // callgraph. diff --git a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp --- a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp @@ -87,16 +87,10 @@ // Get the result lattices. SmallVector resultLattices; resultLattices.reserve(op->getNumResults()); - // Track whether all results have reached their fixpoint. - bool allAtFixpoint = true; for (Value result : op->getResults()) { AbstractSparseLattice *resultLattice = getLatticeElement(result); - allAtFixpoint &= resultLattice->isAtFixpoint(); resultLattices.push_back(resultLattice); } - // If all result lattices have reached a fixpoint, there is nothing to do. - if (allAtFixpoint) - return; // The results of a region branch operation are determined by control-flow. if (auto branch = dyn_cast(op)) { @@ -145,16 +139,10 @@ // Get the argument lattices. SmallVector argLattices; argLattices.reserve(block->getNumArguments()); - bool allAtFixpoint = true; for (BlockArgument argument : block->getArguments()) { AbstractSparseLattice *argLattice = getLatticeElement(argument); - allAtFixpoint &= argLattice->isAtFixpoint(); argLattices.push_back(argLattice); } - // If all argument lattices have reached their fixpoints, then there is - // nothing to do. 
- if (allAtFixpoint) - return; // The argument lattices of entry blocks are set by region control-flow or the // callgraph. diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp --- a/mlir/lib/Bindings/Python/IRAttributes.cpp +++ b/mlir/lib/Bindings/Python/IRAttributes.cpp @@ -113,15 +113,9 @@ /// A python-wrapped dense array attribute with an element type and a derived /// implementation class. template -class PyDenseArrayAttribute - : public PyConcreteAttribute> { +class PyDenseArrayAttribute : public PyConcreteAttribute { public: - static constexpr typename PyConcreteAttribute< - PyDenseArrayAttribute>::IsAFunctionTy isaFunction = - DerivedT::isaFunction; - static constexpr const char *pyClassName = DerivedT::pyClassName; - using PyConcreteAttribute< - PyDenseArrayAttribute>::PyConcreteAttribute; + using PyConcreteAttribute::PyConcreteAttribute; /// Iterator over the integer elements of a dense array. class PyDenseArrayIterator { @@ -158,33 +152,29 @@ EltTy getItem(intptr_t i) { return DerivedT::getElement(*this, i); } /// Bind the attribute class. - static void bindDerived(typename PyConcreteAttribute< - PyDenseArrayAttribute>::ClassTy &c) { + static void bindDerived(typename PyConcreteAttribute::ClassTy &c) { // Bind the constructor. c.def_static( "get", [](const std::vector &values, DefaultingPyMlirContext ctx) { MlirAttribute attr = DerivedT::getAttribute(ctx->get(), values.size(), values.data()); - return PyDenseArrayAttribute(ctx->getRef(), attr); + return DerivedT(ctx->getRef(), attr); }, py::arg("values"), py::arg("context") = py::none(), "Gets a uniqued dense array attribute"); // Bind the array methods. - c.def("__getitem__", - [](PyDenseArrayAttribute &arr, intptr_t i) { - if (i >= mlirDenseArrayGetNumElements(arr)) - throw py::index_error("DenseArray index out of range"); - return arr.getItem(i); - }); - c.def("__len__", [](const PyDenseArrayAttribute &arr) { - return mlirDenseArrayGetNumElements(arr); + c.def("__getitem__", [](DerivedT &arr, intptr_t i) { + if (i >= mlirDenseArrayGetNumElements(arr)) + throw py::index_error("DenseArray index out of range"); + return arr.getItem(i); }); - c.def("__iter__", [](const PyDenseArrayAttribute &arr) { - return PyDenseArrayIterator(arr); + c.def("__len__", [](const DerivedT &arr) { + return mlirDenseArrayGetNumElements(arr); }); - c.def("__add__", [](PyDenseArrayAttribute &arr, - py::list extras) { + c.def("__iter__", + [](const DerivedT &arr) { return PyDenseArrayIterator(arr); }); + c.def("__add__", [](DerivedT &arr, py::list extras) { std::vector values; intptr_t numOldElements = mlirDenseArrayGetNumElements(arr); values.reserve(numOldElements + py::len(extras)); @@ -194,7 +184,7 @@ values.push_back(pyTryCast(attr)); MlirAttribute attr = DerivedT::getAttribute(arr.getContext()->get(), values.size(), values.data()); - return PyDenseArrayAttribute(arr.getContext(), attr); + return DerivedT(arr.getContext(), attr); }); } }; diff --git a/mlir/lib/Conversion/ComplexToLibm/ComplexToLibm.cpp b/mlir/lib/Conversion/ComplexToLibm/ComplexToLibm.cpp --- a/mlir/lib/Conversion/ComplexToLibm/ComplexToLibm.cpp +++ b/mlir/lib/Conversion/ComplexToLibm/ComplexToLibm.cpp @@ -131,7 +131,8 @@ ConversionTarget target(getContext()); target.addLegalDialect(); target.addIllegalOp(); + complex::CosOp, complex::SinOp, complex::ConjOp, + complex::LogOp, complex::AbsOp, complex::AngleOp>(); if (failed(applyPartialConversion(module, target, std::move(patterns)))) signalPassFailure(); } diff --git 
a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp --- a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp @@ -399,13 +399,13 @@ assert(areVarsUnique(*a) && "A's values aren't unique"); assert(areVarsUnique(*b) && "B's values aren't unique"); - assert(std::all_of(a->getMaybeValues().begin() + offset, - a->getMaybeValues().end(), - [](Optional var) { return var.has_value(); })); + assert( + llvm::all_of(llvm::drop_begin(a->getMaybeValues(), offset), + [](const Optional &var) { return var.has_value(); })); - assert(std::all_of(b->getMaybeValues().begin() + offset, - b->getMaybeValues().end(), - [](Optional var) { return var.has_value(); })); + assert( + llvm::all_of(llvm::drop_begin(b->getMaybeValues(), offset), + [](const Optional &var) { return var.has_value(); })); SmallVector aDimValues; a->getValues(offset, a->getNumDimVars(), &aDimValues); diff --git a/mlir/lib/Dialect/Bufferization/Transforms/AllocTensorElimination.cpp b/mlir/lib/Dialect/Bufferization/Transforms/AllocTensorElimination.cpp --- a/mlir/lib/Dialect/Bufferization/Transforms/AllocTensorElimination.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/AllocTensorElimination.cpp @@ -140,6 +140,15 @@ return WalkResult::skip(); Value allocTensor = maybeAllocTensor.front(); + // Replace only if the types match. + // TODO: This could be extended to support IR such as: + // %0 = bufferization.alloc_tensor : tensor<128xf32> + // %1 = "some_op"(%0) : (tensor<128xf32>) -> (tensor<128xf32>) + // %2 = tensor.expand_shape %1 ... + // %3 = tensor.insert_slice %2 into ... + if (allocTensor.getType() != operand.get().getType()) + return WalkResult::skip(); + // Find a suitable insertion point. Operation *insertionPoint = findValidInsertionPoint(allocTensor.getDefiningOp(), neededValues); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -450,6 +450,31 @@ applyPermutationToVector(iteratorTypes, permutation); } + // Handle distribution. Create a vector of the same size as the number of + // loops to be tiled. + SmallVector procInfo; + if (options.distribution) { + procInfo.resize( + iteratorTypes.size(), + linalg::ProcInfo{nullptr, nullptr, linalg::DistributionMethod::None}); + // Collect the loop ranges of the tiled loops that are parallel. + SmallVector parallelLoopRanges; + for (auto iteratorType : llvm::enumerate(iteratorTypes)) { + if (!isParallelIterator(iteratorType.value())) + break; + parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); + } + auto returnedProcInfo = + options.distribution->procInfo(b, op.getLoc(), parallelLoopRanges); + unsigned procIdIdx = 0; + // Update the distribution information for the loops. + for (auto iteratorType : llvm::enumerate(iteratorTypes)) { + if (!isParallelIterator(iteratorType.value())) + break; + procInfo[iteratorType.index()] = returnedProcInfo[procIdIdx++]; + } + } + // 2. Create the tiled loops. LinalgOp res = op; SmallVector ivs, tensorResults; @@ -489,8 +514,7 @@ return scf::ValueVector(tensorResults.begin(), tensorResults.end()); }; GenerateLoopNest::doit(b, op.getLoc(), loopRanges, op, iteratorTypes, - tiledLoopBodyBuilder, options.distribution, - options.distributionTypes); + tiledLoopBodyBuilder, procInfo); // 3. Transform IndexOp results w.r.t. the tiling. 
transformIndexOps(b, res, ivs, loopIndexToRangeIndex); diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -518,25 +518,11 @@ function_ref bodyBuilderFn, - Optional distributionOptions, - ArrayRef distributionTypes) { + ArrayRef procInfo) { + assert((procInfo.empty() || (procInfo.size() == loopRanges.size())) && + "expected as many entries for proc info as number of loops, even if " + "they are null entries"); SmallVector iterArgInitValues = linalgOp.getOutputTensorOperands(); - // Create procInfo so it dominates loops, if appropriate. - SmallVector procInfo; - SmallVector distributionMethod; - if (distributionOptions) { - // Collect loop ranges for parallel dimensions. - SmallVector parallelLoopRanges; - for (const auto &iteratorType : enumerate(iteratorTypes)) - if (isParallelIterator(iteratorType.value())) - parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); - - // Get their distribution schemes. - distributionMethod = distributionOptions->distributionMethod; - if (distributionMethod.size() < parallelLoopRanges.size()) - parallelLoopRanges.resize(distributionMethod.size()); - procInfo = distributionOptions->procInfo(b, loc, parallelLoopRanges); - } SmallVector lbs, ubs, steps; unpackRanges(b, loc, loopRanges, lbs, ubs, steps); @@ -554,20 +540,17 @@ return bodyBuilderFn(b, loc, ivs, operandValuesToUse); }); - if (!distributionOptions || loopNest.loops.empty()) + if (loopNest.loops.empty() || procInfo.empty()) return; // Filter out scf.for loops that were created out of parallel dimensions. - SmallVector loops; - for (const auto &iteratorType : enumerate(iteratorTypes)) - if (isParallelIterator(iteratorType.value())) - loops.push_back(loopNest.loops[iteratorType.index()]); - - // Distribute - only supports cyclic distribution for now. - for (auto it : llvm::zip(loops, procInfo, distributionMethod)) - if (std::get<2>(it) == DistributionMethod::Cyclic) - mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId, - std::get<1>(it).nprocs); + for (auto loop : llvm::enumerate(loopNest.loops)) { + if (procInfo[loop.index()].distributionMethod == + DistributionMethod::Cyclic) { + mapLoopToProcessorIds(loop.value(), procInfo[loop.index()].procId, + procInfo[loop.index()].nprocs); + } + } } /// Specialization to build affine "for" nest. @@ -578,7 +561,7 @@ function_ref bodyBuilderFn, - Optional, ArrayRef) { + ArrayRef /*procInfo*/) { SmallVector iterArgInitValues = linalgOp.getOutputTensorOperands(); assert(iterArgInitValues.empty() && "unexpected AffineForOp init values"); SmallVector lbs, ubs, steps; @@ -625,12 +608,13 @@ static void generateParallelLoopNest( OpBuilder &b, Location loc, ValueRange lbs, ValueRange ubs, ValueRange steps, ArrayRef iteratorTypes, + ArrayRef procInfo, function_ref bodyBuilderFn, - SmallVectorImpl &ivStorage, - ArrayRef distributionMethod = {}) { + SmallVectorImpl &ivStorage) { assert(lbs.size() == ubs.size()); assert(lbs.size() == steps.size()); assert(lbs.size() == iteratorTypes.size()); + assert(procInfo.empty() || (lbs.size() == procInfo.size())); // If there are no (more) loops to be generated, generate the body and be // done with it. @@ -639,55 +623,56 @@ return; } - // Find the outermost parallel loops and drop their types from the list. 
- unsigned nLoops = iteratorTypes.size(); - unsigned nOuterPar = - nLoops - iteratorTypes.drop_while(isParallelIterator).size(); - // If there are no outer parallel loops, generate one sequential loop and - // recurse. Note that we wouldn't have dropped anything from `iteratorTypes` - // in this case. - if (nOuterPar == 0) { + // recurse. + if (!isParallelIterator(iteratorTypes.front())) { LoopNest singleLoop = buildLoopNest( b, loc, lbs.take_front(), ubs.take_front(), steps.take_front(), [&](OpBuilder &b, Location loc, ValueRange ivs) { ivStorage.append(ivs.begin(), ivs.end()); - generateParallelLoopNest(b, loc, lbs.drop_front(), ubs.drop_front(), - steps.drop_front(), - iteratorTypes.drop_front(), bodyBuilderFn, - ivStorage, distributionMethod); + generateParallelLoopNest( + b, loc, lbs.drop_front(), ubs.drop_front(), steps.drop_front(), + iteratorTypes.drop_front(), + procInfo.empty() ? procInfo : procInfo.drop_front(), + bodyBuilderFn, ivStorage); }); return; } - if (distributionMethod.empty()) { + + unsigned nLoops = iteratorTypes.size(); + unsigned numProcessed = 0; + DistributionMethod distributionMethod = DistributionMethod::None; + if (procInfo.empty()) { + numProcessed = nLoops - iteratorTypes.drop_while(isParallelIterator).size(); + } else { + distributionMethod = procInfo.front().distributionMethod; + numProcessed = + nLoops - procInfo + .drop_while([&](linalg::ProcInfo p) { + return p.distributionMethod == distributionMethod; + }) + .size(); + } + + auto remainderProcInfo = + procInfo.empty() ? procInfo : procInfo.drop_front(numProcessed); + switch (distributionMethod) { + case DistributionMethod::None: { // Generate a single parallel loop-nest operation for all outermost // parallel loops and recurse. b.create( - loc, lbs.take_front(nOuterPar), ubs.take_front(nOuterPar), - steps.take_front(nOuterPar), + loc, lbs.take_front(numProcessed), ubs.take_front(numProcessed), + steps.take_front(numProcessed), [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange localIvs) { ivStorage.append(localIvs.begin(), localIvs.end()); generateParallelLoopNest( - nestedBuilder, nestedLoc, lbs.drop_front(nOuterPar), - ubs.drop_front(nOuterPar), steps.drop_front(nOuterPar), - iteratorTypes.drop_front(nOuterPar), bodyBuilderFn, ivStorage, - (distributionMethod.size() < nOuterPar) - ? ArrayRef() - : distributionMethod.drop_front(nOuterPar)); + nestedBuilder, nestedLoc, lbs.drop_front(numProcessed), + ubs.drop_front(numProcessed), steps.drop_front(numProcessed), + iteratorTypes.drop_front(numProcessed), remainderProcInfo, + bodyBuilderFn, ivStorage); }); return; } - - // Process all consecutive similarly distributed loops simultaneously. - DistributionMethod methodToUse = distributionMethod[0]; - unsigned numProcessed = 1; - for (unsigned i = 1; i < nOuterPar && i < distributionMethod.size(); ++i) { - if (distributionMethod[i] != methodToUse) - break; - numProcessed++; - } - - switch (methodToUse) { case DistributionMethod::Cyclic: { // Generate a single parallel loop-nest operation for all outermost // parallel loops and recurse. @@ -699,10 +684,8 @@ generateParallelLoopNest( nestedBuilder, nestedLoc, lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), steps.drop_front(numProcessed), - iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage, - (distributionMethod.size() < numProcessed) - ? 
ArrayRef() - : distributionMethod.drop_front(numProcessed)); + iteratorTypes.drop_front(numProcessed), remainderProcInfo, + bodyBuilderFn, ivStorage); }); return; } @@ -714,11 +697,11 @@ cond = ab._and(cond, ab.slt(lbs[i], ubs[i])); ivStorage.append(lbs.begin(), std::next(lbs.begin(), numProcessed)); b.create(loc, cond, [&](OpBuilder &b, Location loc) { - generateParallelLoopNest( - b, loc, lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), - steps.drop_front(numProcessed), - iteratorTypes.drop_front(numProcessed), bodyBuilderFn, ivStorage, - distributionMethod.drop_front(numProcessed)); + generateParallelLoopNest(b, loc, lbs.drop_front(numProcessed), + ubs.drop_front(numProcessed), + steps.drop_front(numProcessed), + iteratorTypes.drop_front(numProcessed), + remainderProcInfo, bodyBuilderFn, ivStorage); b.create(loc, ValueRange{}); }); return; @@ -730,7 +713,7 @@ generateParallelLoopNest( b, loc, lbs.drop_front(numProcessed), ubs.drop_front(numProcessed), steps.drop_front(numProcessed), iteratorTypes.drop_front(numProcessed), - bodyBuilderFn, ivStorage, distributionMethod.drop_front(numProcessed)); + remainderProcInfo, bodyBuilderFn, ivStorage); return; } } @@ -743,13 +726,14 @@ function_ref bodyBuilderFn, - Optional distributionOptions, - ArrayRef distributionTypes) { + ArrayRef procInfo) { SmallVector iterArgInitValues = linalgOp.getOutputTensorOperands(); assert(iterArgInitValues.empty() && "unexpected ParallelOp init values"); // This function may be passed more iterator types than ranges. assert(iteratorTypes.size() >= loopRanges.size() && "expected iterator type for all ranges"); + assert((procInfo.empty() || (procInfo.size() == loopRanges.size())) && + "expected proc information for all loops when present"); iteratorTypes = iteratorTypes.take_front(loopRanges.size()); SmallVector lbsStorage, ubsStorage, stepsStorage, ivs; unsigned numLoops = iteratorTypes.size(); @@ -762,42 +746,22 @@ unpackRanges(b, loc, loopRanges, lbsStorage, ubsStorage, stepsStorage); // Modify the lb, ub, and step based on the distribution options. 
- SmallVector distributionMethod; - if (distributionOptions) { - auto &options = *distributionOptions; - distributionMethod.assign(distributionOptions->distributionMethod.begin(), - distributionOptions->distributionMethod.end()); - SmallVector parallelLoopRanges; - for (const auto &iteratorType : enumerate(iteratorTypes)) { - if (isParallelIterator(iteratorType.value())) - parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); - } - if (distributionMethod.size() < parallelLoopRanges.size()) - parallelLoopRanges.resize(distributionMethod.size()); - SmallVector procInfo = - options.procInfo(b, loc, parallelLoopRanges); - unsigned index = 0; - for (const auto &iteratorType : enumerate(iteratorTypes)) { - if (index >= procInfo.size()) - break; - if (isParallelIterator(iteratorType.value())) { - unsigned i = iteratorType.index(); - updateBoundsForCyclicDistribution(b, loc, procInfo[index].procId, - procInfo[index].nprocs, lbsStorage[i], - ubsStorage[i], stepsStorage[i]); - index++; - } + for (auto it : llvm::enumerate(procInfo)) { + if (it.value().distributionMethod != linalg::DistributionMethod::None) { + updateBoundsForCyclicDistribution( + b, loc, it.value().procId, it.value().nprocs, lbsStorage[it.index()], + ubsStorage[it.index()], stepsStorage[it.index()]); } } ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage); generateParallelLoopNest( - b, loc, lbs, ubs, steps, iteratorTypes, + b, loc, lbs, ubs, steps, iteratorTypes, procInfo, [&](OpBuilder &b, Location loc, ValueRange ivs) { SmallVector operandValuesToUse = linalgOp.getInputAndOutputOperands(); bodyBuilderFn(b, loc, ivs, operandValuesToUse); }, - ivs, distributionMethod); + ivs); assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops"); } diff --git a/mlir/lib/Dialect/Math/IR/MathOps.cpp b/mlir/lib/Dialect/Math/IR/MathOps.cpp --- a/mlir/lib/Dialect/Math/IR/MathOps.cpp +++ b/mlir/lib/Dialect/Math/IR/MathOps.cpp @@ -134,6 +134,56 @@ }); } +//===----------------------------------------------------------------------===// +// IPowIOp folder +//===----------------------------------------------------------------------===// + +OpFoldResult math::IPowIOp::fold(ArrayRef operands) { + return constFoldBinaryOpConditional( + operands, [](const APInt &base, const APInt &power) -> Optional { + unsigned width = base.getBitWidth(); + auto zeroValue = APInt::getZero(width); + APInt oneValue{width, 1ULL, /*isSigned=*/true}; + APInt minusOneValue{width, -1ULL, /*isSigned=*/true}; + + if (power.isZero()) + return oneValue; + + if (power.isNegative()) { + // Leave 0 raised to negative power not folded. + if (base.isZero()) + return {}; + if (base.eq(oneValue)) + return oneValue; + // If abs(base) > 1, then the result is zero. + if (base.ne(minusOneValue)) + return zeroValue; + // base == -1: + // -1: power is odd + // 1: power is even + if (power[0] == 1) + return minusOneValue; + + return oneValue; + } + + // power is positive. 
+ APInt result = oneValue; + APInt curBase = base; + APInt curPower = power; + while (true) { + if (curPower[0] == 1) + result *= curBase; + curPower.lshrInPlace(1); + if (curPower.isZero()) + return result; + curBase *= curBase; + } + }); +} + //===----------------------------------------------------------------------===// // LogOp folder //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Math/Transforms/AlgebraicSimplification.cpp b/mlir/lib/Dialect/Math/Transforms/AlgebraicSimplification.cpp --- a/mlir/lib/Dialect/Math/Transforms/AlgebraicSimplification.cpp +++ b/mlir/lib/Dialect/Math/Transforms/AlgebraicSimplification.cpp @@ -112,9 +112,100 @@ return failure(); } +//----------------------------------------------------------------------------// +// IPowIOp strength reduction. +//----------------------------------------------------------------------------// + +namespace { +struct IPowIStrengthReduction : public OpRewritePattern { + unsigned exponentThreshold; + +public: + IPowIStrengthReduction(MLIRContext *context, unsigned exponentThreshold = 3, + PatternBenefit benefit = 1, + ArrayRef generatedNames = {}) + : OpRewritePattern(context, benefit, generatedNames), + exponentThreshold(exponentThreshold) {} + LogicalResult matchAndRewrite(math::IPowIOp op, + PatternRewriter &rewriter) const final; +}; +} // namespace + +LogicalResult +IPowIStrengthReduction::matchAndRewrite(math::IPowIOp op, + PatternRewriter &rewriter) const { + Location loc = op.getLoc(); + Value base = op.getLhs(); + + IntegerAttr scalarExponent; + DenseIntElementsAttr vectorExponent; + + bool isScalar = matchPattern(op.getRhs(), m_Constant(&scalarExponent)); + bool isVector = matchPattern(op.getRhs(), m_Constant(&vectorExponent)); + + // Simplify cases with known exponent value. + int64_t exponentValue = 0; + if (isScalar) + exponentValue = scalarExponent.getInt(); + else if (isVector && vectorExponent.isSplat()) + exponentValue = vectorExponent.getSplatValue().getInt(); + else + return failure(); + + // Broadcasts a scalar value into a vector type compatible with `op` if needed. + auto bcast = [&](Value value) -> Value { + if (auto vec = op.getType().dyn_cast()) + return rewriter.create(loc, vec, value); + return value; + }; + + if (exponentValue == 0) { + // Replace `ipowi(x, 0)` with `1`. + Value one = rewriter.create( + loc, rewriter.getIntegerAttr(getElementTypeOrSelf(op.getType()), 1)); + rewriter.replaceOp(op, bcast(one)); + return success(); + } + + bool exponentIsNegative = false; + if (exponentValue < 0) { + exponentIsNegative = true; + exponentValue *= -1; + } + + // Bail out if `abs(exponent)` exceeds the threshold. + if (exponentValue > exponentThreshold) + return failure(); + + // Invert the base for a negative exponent, i.e. for + // `ipowi(x, negative_exponent)` set `x` to `1 / x`. + if (exponentIsNegative) { + Value one = rewriter.create( + loc, rewriter.getIntegerAttr(getElementTypeOrSelf(op.getType()), 1)); + base = rewriter.create(loc, bcast(one), base); + } + + Value result = base; + // Transform to naive sequence of multiplications: + // * For positive exponent case replace: + // `ipowi(x, positive_exponent)` + // with: + // x * x * x * ... + // * For negative exponent case replace: + // `ipowi(x, negative_exponent)` + // with: + // (1 / x) * (1 / x) * (1 / x) * ... 
+ for (unsigned i = 1; i < exponentValue; ++i) + result = rewriter.create(loc, result, base); + + rewriter.replaceOp(op, result); + return success(); +} + //----------------------------------------------------------------------------// void mlir::populateMathAlgebraicSimplificationPatterns( RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); + patterns.add( + patterns.getContext()); } diff --git a/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp b/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp --- a/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp +++ b/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp @@ -42,7 +42,8 @@ PatternRewriter &rewrite) const override { Location location = op->getLoc(); - if (op->hasAttr(op.getTf32EnabledAttrName())) + if (op->hasAttr(op.getTf32EnabledAttrName()) || + !op.getMatrixA().getType().cast().getElementType().isF32()) return failure(); if (precision == MmaSyncF32Lowering::Unkown) diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp @@ -348,6 +348,39 @@ return CooperativeMatrixNVType::get(elementTy, scope, dims[0], dims[1]); } +// joint-matrix-type ::= `!spv.jointmatrix` `<`rows `x` columns `x` element-type +// `,` layout `,` scope`>` +static Type parseJointMatrixType(SPIRVDialect const &dialect, + DialectAsmParser &parser) { + if (parser.parseLess()) + return Type(); + + SmallVector dims; + SMLoc countLoc = parser.getCurrentLocation(); + if (parser.parseDimensionList(dims, /*allowDynamic=*/false)) + return Type(); + + if (dims.size() != 2) { + parser.emitError(countLoc, "expected rows and columns size"); + return Type(); + } + + auto elementTy = parseAndVerifyType(dialect, parser); + if (!elementTy) + return Type(); + MatrixLayout matrixLayout; + if (parser.parseComma() || + parseEnumKeywordAttr(matrixLayout, parser, "matrixLayout ")) + return Type(); + Scope scope; + if (parser.parseComma() || parseEnumKeywordAttr(scope, parser, "scope ")) + return Type(); + if (parser.parseGreater()) + return Type(); + return JointMatrixINTELType::get(elementTy, scope, dims[0], dims[1], + matrixLayout); +} + // TODO: Reorder methods to be utilities first and parse*Type // methods in alphabetical order // @@ -753,6 +786,8 @@ return parseArrayType(*this, parser); if (keyword == "coopmatrix") return parseCooperativeMatrixType(*this, parser); + if (keyword == "jointmatrix") + return parseJointMatrixType(*this, parser); if (keyword == "image") return parseImageType(*this, parser); if (keyword == "ptr") @@ -859,6 +894,13 @@ os << ">"; } +static void print(JointMatrixINTELType type, DialectAsmPrinter &os) { + os << "jointmatrix<" << type.getRows() << "x" << type.getColumns() << "x"; + os << type.getElementType() << ", " + << stringifyMatrixLayout(type.getMatrixLayout()); + os << ", " << stringifyScope(type.getScope()) << ">"; +} + static void print(MatrixType type, DialectAsmPrinter &os) { os << "matrix<" << type.getNumColumns() << " x " << type.getColumnType(); os << ">"; @@ -866,9 +908,9 @@ void SPIRVDialect::printType(Type type, DialectAsmPrinter &os) const { TypeSwitch(type) - .Case( - [&](auto type) { print(type, os); }) + .Case([&](auto type) { print(type, os); }) .Default([](Type) { llvm_unreachable("unhandled SPIR-V type"); }); } diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp 
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp @@ -436,6 +436,13 @@ resultType.cast().getElementType(); } + if (auto jointMatrixType = + operandType.dyn_cast()) { + operandType = jointMatrixType.getElementType(); + resultType = + resultType.cast().getElementType(); + } + auto operandTypeBitWidth = operandType.getIntOrFloatBitWidth(); auto resultTypeBitWidth = resultType.getIntOrFloatBitWidth(); auto isSameBitWidth = operandTypeBitWidth == resultTypeBitWidth; @@ -1637,6 +1644,17 @@ return success(); } + if (auto jointType = cType.dyn_cast()) { + if (constituents.size() != 1) + return emitOpError("has incorrect number of operands: expected ") + << "1, but provided " << constituents.size(); + if (jointType.getElementType() != constituents.front().getType()) + return emitOpError("operand type mismatch: expected operand type ") + << jointType.getElementType() << ", but provided " + << constituents.front().getType(); + return success(); + } + if (constituents.size() == cType.getNumElements()) { for (auto index : llvm::seq(0, constituents.size())) { if (constituents[index].getType() != cType.getElementType(index)) { @@ -3893,6 +3911,70 @@ return verifyCoopMatrixMulAdd(*this); } +static LogicalResult +verifyPointerAndJointMatrixType(Operation *op, Type pointer, Type jointMatrix) { + Type pointeeType = pointer.cast().getPointeeType(); + if (!pointeeType.isa() && !pointeeType.isa()) + return op->emitError( + "Pointer must point to a scalar or vector type but provided ") + << pointeeType; + spirv::StorageClass storage = + pointer.cast().getStorageClass(); + if (storage != spirv::StorageClass::Workgroup && + storage != spirv::StorageClass::CrossWorkgroup) + return op->emitError("Pointer storage class must be Workgroup or " + "CrossWorkgroup but provided ") + << stringifyStorageClass(storage); + return success(); +} + +//===----------------------------------------------------------------------===// +// spv.JointMatrixLoadINTEL +//===----------------------------------------------------------------------===// + +LogicalResult spirv::JointMatrixLoadINTELOp::verify() { + return verifyPointerAndJointMatrixType(*this, pointer().getType(), + result().getType()); +} + +//===----------------------------------------------------------------------===// +// spv.JointMatrixStoreINTEL +//===----------------------------------------------------------------------===// + +LogicalResult spirv::JointMatrixStoreINTELOp::verify() { + return verifyPointerAndJointMatrixType(*this, pointer().getType(), + object().getType()); +} + +//===----------------------------------------------------------------------===// +// spv.JointMatrixMadINTEL +//===----------------------------------------------------------------------===// + +static LogicalResult verifyJointMatrixMad(spirv::JointMatrixMadINTELOp op) { + if (op.c().getType() != op.result().getType()) + return op.emitOpError("result and third operand must have the same type"); + auto typeA = op.a().getType().cast(); + auto typeB = op.b().getType().cast(); + auto typeC = op.c().getType().cast(); + auto typeR = op.result().getType().cast(); + if (typeA.getRows() != typeR.getRows() || + typeA.getColumns() != typeB.getRows() || + typeB.getColumns() != typeR.getColumns()) + return op.emitOpError("matrix size must match"); + if (typeR.getScope() != typeA.getScope() || + typeR.getScope() != typeB.getScope() || + typeR.getScope() != typeC.getScope()) + return op.emitOpError("matrix scope must match"); + if (typeA.getElementType() != typeB.getElementType() || + typeR.getElementType() != 
typeC.getElementType()) + return op.emitOpError("matrix element type must match"); + return success(); +} + +LogicalResult spirv::JointMatrixMadINTELOp::verify() { + return verifyJointMatrixMad(*this); +} + //===----------------------------------------------------------------------===// // spv.MatrixTimesScalar //===----------------------------------------------------------------------===// @@ -4150,6 +4232,8 @@ if (cType.isa()) return emitError("unsupported composite type ") << cType; + if (cType.isa()) + return emitError("unsupported composite type ") << cType; if (constituents.size() != cType.getNumElements()) return emitError("has incorrect number of operands: expected ") << cType.getNumElements() << ", but provided " diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp @@ -89,9 +89,9 @@ bool CompositeType::classof(Type type) { if (auto vectorType = type.dyn_cast()) return isValid(vectorType); - return type - .isa(); + return type.isa(); } bool CompositeType::isValid(VectorType type) { @@ -110,7 +110,8 @@ Type CompositeType::getElementType(unsigned index) const { return TypeSwitch(*this) - .Case( + .Case( [](auto type) { return type.getElementType(); }) .Case([](MatrixType type) { return type.getColumnType(); }) .Case( @@ -132,6 +133,10 @@ llvm_unreachable( "invalid to query number of elements of spirv::CooperativeMatrix type"); } + if (isa()) { + llvm_unreachable( + "invalid to query number of elements of spirv::JointMatrix type"); + } if (isa()) { llvm_unreachable( "invalid to query number of elements of spirv::RuntimeArray type"); @@ -140,15 +145,16 @@ } bool CompositeType::hasCompileTimeKnownNumElements() const { - return !isa(); + return !isa(); } void CompositeType::getExtensions( SPIRVType::ExtensionArrayRefVector &extensions, Optional storage) { TypeSwitch(*this) - .Case( + .Case( [&](auto type) { type.getExtensions(extensions, storage); }) .Case([&](VectorType type) { return type.getElementType().cast().getExtensions( @@ -161,8 +167,8 @@ SPIRVType::CapabilityArrayRefVector &capabilities, Optional storage) { TypeSwitch(*this) - .Case( + .Case( [&](auto type) { type.getCapabilities(capabilities, storage); }) .Case([&](VectorType type) { auto vecSize = getNumElements(); @@ -255,6 +261,74 @@ capabilities.push_back(ref); } +//===----------------------------------------------------------------------===// +// JointMatrixType +//===----------------------------------------------------------------------===// + +struct spirv::detail::JointMatrixTypeStorage : public TypeStorage { + using KeyTy = std::tuple; + + static JointMatrixTypeStorage *construct(TypeStorageAllocator &allocator, + const KeyTy &key) { + return new (allocator.allocate()) + JointMatrixTypeStorage(key); + } + + bool operator==(const KeyTy &key) const { + return key == KeyTy(elementType, rows, columns, matrixLayout, scope); + } + + JointMatrixTypeStorage(const KeyTy &key) + : elementType(std::get<0>(key)), rows(std::get<1>(key)), + columns(std::get<2>(key)), scope(std::get<4>(key)), + matrixLayout(std::get<3>(key)) {} + + Type elementType; + unsigned rows; + unsigned columns; + Scope scope; + MatrixLayout matrixLayout; +}; + +JointMatrixINTELType JointMatrixINTELType::get(Type elementType, Scope scope, + unsigned rows, unsigned columns, + MatrixLayout matrixLayout) { + return Base::get(elementType.getContext(), elementType, rows, columns, + matrixLayout, scope); +} + +Type 
JointMatrixINTELType::getElementType() const { + return getImpl()->elementType; +} + +Scope JointMatrixINTELType::getScope() const { return getImpl()->scope; } + +unsigned JointMatrixINTELType::getRows() const { return getImpl()->rows; } + +unsigned JointMatrixINTELType::getColumns() const { return getImpl()->columns; } + +MatrixLayout JointMatrixINTELType::getMatrixLayout() const { + return getImpl()->matrixLayout; +} + +void JointMatrixINTELType::getExtensions( + SPIRVType::ExtensionArrayRefVector &extensions, + Optional storage) { + getElementType().cast().getExtensions(extensions, storage); + static const Extension exts[] = {Extension::SPV_INTEL_joint_matrix}; + ArrayRef ref(exts, llvm::array_lengthof(exts)); + extensions.push_back(ref); +} + +void JointMatrixINTELType::getCapabilities( + SPIRVType::CapabilityArrayRefVector &capabilities, + Optional storage) { + getElementType().cast().getCapabilities(capabilities, storage); + static const Capability caps[] = {Capability::JointMatrixINTEL}; + ArrayRef ref(caps, llvm::array_lengthof(caps)); + capabilities.push_back(ref); +} + //===----------------------------------------------------------------------===// // ImageType //===----------------------------------------------------------------------===// @@ -1172,6 +1246,7 @@ //===----------------------------------------------------------------------===// void SPIRVDialect::registerTypes() { - addTypes(); + addTypes(); } diff --git a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp --- a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp +++ b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp @@ -32,8 +32,8 @@ Value mlir::tosa::clampFloatHelper(Location loc, Value arg, arith::ConstantOp min, arith::ConstantOp max, OpBuilder &rewriter) { - Value minValue = rewriter.create(loc, arg, min); - return rewriter.create(loc, minValue, max); + Value minValue = rewriter.create(loc, arg, max); + return rewriter.create(loc, minValue, min); } Value mlir::tosa::clampIntHelper(Location loc, Value arg, arith::ConstantOp min, diff --git a/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp b/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp --- a/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/DeserializeOps.cpp @@ -168,6 +168,8 @@ return processType(opcode, operands); case spirv::Opcode::OpTypeForwardPointer: return processTypeForwardPointer(operands); + case spirv::Opcode::OpTypeJointMatrixINTEL: + return processType(opcode, operands); case spirv::Opcode::OpConstant: return processConstant(operands, /*isSpec=*/false); case spirv::Opcode::OpSpecConstant: diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.h @@ -257,6 +257,8 @@ LogicalResult processFunctionType(ArrayRef operands); + LogicalResult processJointMatrixType(ArrayRef operands); + LogicalResult processImageType(ArrayRef operands); LogicalResult processSampledImageType(ArrayRef operands); diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -730,6 +730,8 @@ return processCooperativeMatrixType(operands); case spirv::Opcode::OpTypeFunction: return 
processFunctionType(operands); + case spirv::Opcode::OpTypeJointMatrixINTEL: + return processJointMatrixType(operands); case spirv::Opcode::OpTypeImage: return processImageType(operands); case spirv::Opcode::OpTypeSampledImage: @@ -888,6 +890,40 @@ return success(); } +LogicalResult +spirv::Deserializer::processJointMatrixType(ArrayRef operands) { + if (operands.size() != 6) { + return emitError(unknownLoc, "OpTypeJointMatrix must have an element " + "type and row x column parameters"); + } + + Type elementTy = getType(operands[1]); + if (!elementTy) { + return emitError(unknownLoc, "OpTypeJointMatrix references undefined ") + << operands[1]; + } + + auto scope = spirv::symbolizeScope(getConstantInt(operands[5]).getInt()); + if (!scope) { + return emitError(unknownLoc, + "OpTypeJointMatrix references undefined scope ") + << operands[5]; + } + auto matrixLayout = + spirv::symbolizeMatrixLayout(getConstantInt(operands[4]).getInt()); + if (!matrixLayout) { + return emitError(unknownLoc, + "OpTypeJointMatrix references undefined matrix layout ") + << operands[4]; + } + unsigned rows = getConstantInt(operands[2]).getInt(); + unsigned columns = getConstantInt(operands[3]).getInt(); + + typeMap[operands[0]] = spirv::JointMatrixINTELType::get( + elementTy, scope.value(), rows, columns, matrixLayout.value()); + return success(); +} + LogicalResult spirv::Deserializer::processRuntimeArrayType(ArrayRef operands) { if (operands.size() != 2) { diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp --- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp +++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp @@ -598,6 +598,27 @@ return success(); } + if (auto jointMatrixType = type.dyn_cast()) { + uint32_t elementTypeID = 0; + if (failed(processTypeImpl(loc, jointMatrixType.getElementType(), + elementTypeID, serializationCtx))) { + return failure(); + } + typeEnum = spirv::Opcode::OpTypeJointMatrixINTEL; + auto getConstantOp = [&](uint32_t id) { + auto attr = IntegerAttr::get(IntegerType::get(type.getContext(), 32), id); + return prepareConstantInt(loc, attr); + }; + operands.push_back(elementTypeID); + operands.push_back(getConstantOp(jointMatrixType.getRows())); + operands.push_back(getConstantOp(jointMatrixType.getColumns())); + operands.push_back(getConstantOp( + static_cast(jointMatrixType.getMatrixLayout()))); + operands.push_back( + getConstantOp(static_cast(jointMatrixType.getScope()))); + return success(); + } + if (auto matrixType = type.dyn_cast()) { uint32_t elementTypeID = 0; if (failed(processTypeImpl(loc, matrixType.getColumnType(), elementTypeID, diff --git a/mlir/lib/Tools/lsp-server-support/Protocol.cpp b/mlir/lib/Tools/lsp-server-support/Protocol.cpp --- a/mlir/lib/Tools/lsp-server-support/Protocol.cpp +++ b/mlir/lib/Tools/lsp-server-support/Protocol.cpp @@ -121,7 +121,7 @@ return false; if (!llvm::isAlpha(scheme[0])) return false; - return std::all_of(scheme.begin() + 1, scheme.end(), [](char c) { + return llvm::all_of(llvm::drop_begin(scheme), [](char c) { return llvm::isAlnum(c) || c == '+' || c == '.' 
|| c == '-'; }); } diff --git a/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp b/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp --- a/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp +++ b/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp @@ -8,29 +8,19 @@ #include "mlir/Transforms/TopologicalSortUtils.h" #include "mlir/IR/OpDefinition.h" +#include "llvm/ADT/SetVector.h" using namespace mlir; -bool mlir::sortTopologically( - Block *block, llvm::iterator_range ops, - function_ref isOperandReady) { - if (ops.empty()) - return true; - - // The set of operations that have not yet been scheduled. - DenseSet unscheduledOps; - // Mark all operations as unscheduled. - for (Operation &op : ops) - unscheduledOps.insert(&op); - - Block::iterator nextScheduledOp = ops.begin(); - Block::iterator end = ops.end(); - +/// Return `true` if the given operation is ready to be scheduled. +static bool isOpReady(Block *block, Operation *op, + DenseSet &unscheduledOps, + function_ref isOperandReady) { // An operation is ready to be scheduled if all its operands are ready. An // operation is ready if: const auto isReady = [&](Value value, Operation *top) { // - the user-provided callback marks it as ready, - if (isOperandReady && isOperandReady(value, top)) + if (isOperandReady && isOperandReady(value, op)) return true; Operation *parent = value.getDefiningOp(); // - it is a block argument, if (!parent) return true; @@ -41,12 +31,38 @@ if (!ancestor) return true; // - it is defined in a nested region, or - if (ancestor == top) + if (ancestor == op) return true; // - its ancestor in the block is scheduled. return !unscheduledOps.contains(ancestor); }; + // An operation is recursively ready to be scheduled if it and its nested + // operations are ready. + WalkResult readyToSchedule = op->walk([&](Operation *nestedOp) { + return llvm::all_of(nestedOp->getOperands(), + [&](Value operand) { return isReady(operand, op); }) + ? WalkResult::advance() + : WalkResult::interrupt(); + }); + return !readyToSchedule.wasInterrupted(); +} + +bool mlir::sortTopologically( + Block *block, llvm::iterator_range ops, + function_ref isOperandReady) { + if (ops.empty()) + return true; + + // The set of operations that have not yet been scheduled. + DenseSet unscheduledOps; + // Mark all operations as unscheduled. + for (Operation &op : ops) + unscheduledOps.insert(&op); + + Block::iterator nextScheduledOp = ops.begin(); + Block::iterator end = ops.end(); + bool allOpsScheduled = true; while (!unscheduledOps.empty()) { bool scheduledAtLeastOnce = false; @@ -56,16 +72,7 @@ // set, and "schedule" it (move it before the `nextScheduledOp`). for (Operation &op : llvm::make_early_inc_range(llvm::make_range(nextScheduledOp, end))) { - // An operation is recursively ready to be scheduled of it and its nested - // operations are ready. - WalkResult readyToSchedule = op.walk([&](Operation *nestedOp) { - return llvm::all_of( - nestedOp->getOperands(), - [&](Value operand) { return isReady(operand, &op); }) - ? WalkResult::advance() - : WalkResult::interrupt(); - }); - if (readyToSchedule.wasInterrupted()) + if (!isOpReady(block, &op, unscheduledOps, isOperandReady)) continue; // Schedule the operation by moving it to the start. @@ -96,3 +103,48 @@ isOperandReady); return sortTopologically(block, *block, isOperandReady); } + +bool mlir::computeTopologicalSorting( + Block *block, MutableArrayRef ops, + function_ref isOperandReady) { + if (ops.empty()) + return true; + + // The set of operations that have not yet been scheduled. 
diff --git a/mlir/python/mlir/dialects/_structured_transform_ops_ext.py b/mlir/python/mlir/dialects/_structured_transform_ops_ext.py --- a/mlir/python/mlir/dialects/_structured_transform_ops_ext.py +++ b/mlir/python/mlir/dialects/_structured_transform_ops_ext.py @@ -110,6 +110,24 @@ ip=ip) + +class MatchOp: + """Specialization for MatchOp class.""" + + @classmethod + def match_op_names(cls, + target: Union[Operation, Value], + names: Sequence[str], + loc=None, + ip=None): + pdl_operation_type = pdl.OperationType.get() + return cls( + pdl_operation_type, + _get_op_result_or_value(target), + ops=ArrayAttr.get([StringAttr.get(s) for s in names]), + loc=loc, + ip=ip) + + class MultiTileSizesOp: """Specialization for MultitileSizesOp class.""" diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -467,8 +467,8 @@ // CHECK: ^bb0(%[[ARG1:.+]]: f16, // CHECK-DAG: %[[C0:.+]] = arith.constant 0.0 // CHECK-DAG: %[[C6:.+]] = arith.constant 6.0 - // CHECK-DAG: %[[MIN:.+]] = arith.minf %[[ARG1]], %[[C0]] - // CHECK-DAG: %[[MAX:.+]] = arith.maxf %[[MIN]], %[[C6]] + // CHECK-DAG: %[[MIN:.+]] = arith.minf %[[ARG1]], %[[C6]] + // CHECK-DAG: %[[MAX:.+]] = arith.maxf %[[MIN]], %[[C0]] %0 = "tosa.clamp"(%arg0) {min_int = 0 : i64, max_int = 0 : i64, min_fp = 0.0 : f32, max_fp = 6.0 : f32} : (tensor<1xf16>) -> tensor<1xf16> return diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-alloc-tensor-elimination.mlir @@ -94,7 +94,7 @@ // CHECK: func @insertion_point_outside_loop( // CHECK-SAME: %[[t:.*]]: memref<?xf32>, %[[sz:.*]]: index, %[[idx:.*]]: index) func.func @insertion_point_outside_loop(%t : tensor<?xf32>, %sz : index, - %idx : index) -> (tensor<?xf32>) { + %idx : index) -> (tensor<?xf32>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c5 = arith.constant 5 : index @@ -118,3 +118,21 @@ return %r : tensor<?xf32> } + +// ----- + +// AllocTensorElimination currently does not apply to chains where the type is changing. This test just ensures that we do not crash or generate IR that does not verify. + +// CHECK-LABEL: func @shape_mismatch +func.func @shape_mismatch(%t: tensor<5x6x128xf32>) -> tensor<5x6x128xf32> { + %cst = arith.constant 8.0 : f32 + %0 = bufferization.alloc_tensor() : tensor<128xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<128xf32>) -> tensor<128xf32> + %2 = tensor.expand_shape %1 [[0, 1, 2]] + : tensor<128xf32> into tensor<1x1x128xf32> + %3 = tensor.insert_slice %2 into %t[2, 3, 0][1, 1, 128][1, 1, 1] + : tensor<1x1x128xf32> into tensor<5x6x128xf32> + return %3 : tensor<5x6x128xf32> +}
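The TosaToLinalg hunk a little above flips its CHECK lines because the correct lowering of tosa.clamp clips against the upper bound first (min) and the lower bound second (max); the old expectations had the two constants swapped. In scalar form, a small reference sketch of the property the updated test encodes (names chosen here for illustration):

#include <algorithm>

// clamp(x, lo, hi) == max(min(x, hi), lo). For the f16 test above this
// is arith.minf against 6.0 (max_fp) followed by arith.maxf against
// 0.0 (min_fp).
static float clampRef(float x, float lo, float hi) {
  float clipped = std::min(x, hi); // matches: minf %arg1, %c6
  return std::max(clipped, lo);    // matches: maxf %min, %c0
}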
diff --git a/mlir/test/Dialect/Math/algebraic-simplification.mlir b/mlir/test/Dialect/Math/algebraic-simplification.mlir --- a/mlir/test/Dialect/Math/algebraic-simplification.mlir +++ b/mlir/test/Dialect/Math/algebraic-simplification.mlir @@ -73,3 +73,93 @@ %1 = math.powf %arg1, %v : vector<4xf32> return %0, %1 : f32, vector<4xf32> } + +// CHECK-LABEL: @ipowi_zero_exp( +// CHECK-SAME: %[[ARG0:.+]]: i32 +// CHECK-SAME: %[[ARG1:.+]]: vector<4xi32> +// CHECK-SAME: -> (i32, vector<4xi32>) { +func.func @ipowi_zero_exp(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32>) { + // CHECK: %[[CST_S:.*]] = arith.constant 1 : i32 + // CHECK: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32> + // CHECK: return %[[CST_S]], %[[CST_V]] + %c = arith.constant 0 : i32 + %v = arith.constant dense <0> : vector<4xi32> + %0 = math.ipowi %arg0, %c : i32 + %1 = math.ipowi %arg1, %v : vector<4xi32> + return %0, %1 : i32, vector<4xi32> +} + +// CHECK-LABEL: @ipowi_exp_one( +// CHECK-SAME: %[[ARG0:.+]]: i32 +// CHECK-SAME: %[[ARG1:.+]]: vector<4xi32> +// CHECK-SAME: -> (i32, vector<4xi32>, i32, vector<4xi32>) { +func.func @ipowi_exp_one(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32>, i32, vector<4xi32>) { + // CHECK: %[[CST_S:.*]] = arith.constant 1 : i32 + // CHECK: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32> + // CHECK: %[[SCALAR:.*]] = arith.divsi %[[CST_S]], %[[ARG0]] + // CHECK: %[[VECTOR:.*]] = arith.divsi %[[CST_V]], %[[ARG1]] + // CHECK: return %[[ARG0]], %[[ARG1]], %[[SCALAR]], %[[VECTOR]] + %c1 = arith.constant 1 : i32 + %v1 = arith.constant dense <1> : vector<4xi32> + %0 = math.ipowi %arg0, %c1 : i32 + %1 = math.ipowi %arg1, %v1 : vector<4xi32> + %cm1 = arith.constant -1 : i32 + %vm1 = arith.constant dense <-1> : vector<4xi32> + %2 = math.ipowi %arg0, %cm1 : i32 + %3 = math.ipowi %arg1, %vm1 : vector<4xi32> + return %0, %1, %2, %3 : i32, vector<4xi32>, i32, vector<4xi32> +} + +// CHECK-LABEL: @ipowi_exp_two( +// CHECK-SAME: %[[ARG0:.+]]: i32 +// CHECK-SAME: %[[ARG1:.+]]: vector<4xi32> +// CHECK-SAME: -> (i32, vector<4xi32>, i32, vector<4xi32>) { +func.func @ipowi_exp_two(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32>, i32, vector<4xi32>) { + // CHECK: %[[CST_S:.*]] = arith.constant 1 : i32 + // CHECK: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32> + // CHECK: %[[SCALAR0:.*]] = arith.muli %[[ARG0]], %[[ARG0]] + // CHECK: %[[VECTOR0:.*]] = arith.muli %[[ARG1]], %[[ARG1]] + // CHECK: %[[SCALAR1:.*]] = arith.divsi %[[CST_S]], %[[ARG0]] + // CHECK: %[[SMUL:.*]] = arith.muli %[[SCALAR1]], %[[SCALAR1]] + // CHECK: %[[VECTOR1:.*]] = arith.divsi %[[CST_V]], %[[ARG1]] + // CHECK: %[[VMUL:.*]] = arith.muli %[[VECTOR1]], %[[VECTOR1]] + // CHECK: return %[[SCALAR0]], %[[VECTOR0]], %[[SMUL]], %[[VMUL]] + %c1 = arith.constant 2 : i32 + %v1 = arith.constant dense <2> : vector<4xi32> + %0 = math.ipowi %arg0, %c1 : i32 + %1 = math.ipowi %arg1, %v1 : vector<4xi32> + %cm1 = arith.constant -2 : i32 + %vm1 = arith.constant dense <-2> : vector<4xi32> + %2 = math.ipowi %arg0, %cm1 : i32 + %3 = math.ipowi %arg1, %vm1 : vector<4xi32> + return %0, %1, %2, %3 : i32, vector<4xi32>, i32, vector<4xi32> +} + +// CHECK-LABEL: @ipowi_exp_three( +// CHECK-SAME: %[[ARG0:.+]]: i32 +// CHECK-SAME: %[[ARG1:.+]]: vector<4xi32> +// CHECK-SAME: -> (i32, vector<4xi32>, i32, vector<4xi32>) { +func.func @ipowi_exp_three(%arg0: i32, %arg1: vector<4xi32>) -> (i32, vector<4xi32>, i32, vector<4xi32>) { + // CHECK: %[[CST_S:.*]] = arith.constant 1 : i32 + // CHECK: %[[CST_V:.*]] = arith.constant dense<1> : vector<4xi32> + // CHECK: %[[SMUL0:.*]] = arith.muli %[[ARG0]], %[[ARG0]] + // CHECK: %[[SCALAR0:.*]] = arith.muli %[[SMUL0]], %[[ARG0]] + // CHECK: %[[VMUL0:.*]] = arith.muli %[[ARG1]], %[[ARG1]] + // CHECK: %[[VECTOR0:.*]] = arith.muli %[[VMUL0]], %[[ARG1]] + // CHECK: %[[SCALAR1:.*]] = arith.divsi %[[CST_S]], %[[ARG0]] + // CHECK: %[[SMUL1:.*]] = arith.muli %[[SCALAR1]], %[[SCALAR1]] + // CHECK: %[[SMUL2:.*]] = arith.muli %[[SMUL1]], %[[SCALAR1]] + // CHECK: %[[VECTOR1:.*]] = arith.divsi %[[CST_V]], %[[ARG1]] + // CHECK: %[[VMUL1:.*]] = arith.muli %[[VECTOR1]], %[[VECTOR1]] + // CHECK: %[[VMUL2:.*]] = arith.muli %[[VMUL1]], %[[VECTOR1]] + // CHECK: return %[[SCALAR0]], %[[VECTOR0]], %[[SMUL2]], %[[VMUL2]] + %c1 = arith.constant 3 : i32 + %v1 = arith.constant dense <3> : vector<4xi32> + %0 = math.ipowi %arg0, %c1 : i32 + %1 = math.ipowi %arg1, %v1 : vector<4xi32> + %cm1 = arith.constant -3 : i32 + %vm1 = arith.constant dense <-3> : vector<4xi32> + %2 = math.ipowi %arg0, %cm1 : i32 + %3 = math.ipowi %arg1, %vm1 : vector<4xi32> + return %0, %1, %2, %3 : i32, vector<4xi32>, i32, vector<4xi32> +} diff --git a/mlir/test/Dialect/Math/canonicalize_ipowi.mlir b/mlir/test/Dialect/Math/canonicalize_ipowi.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Math/canonicalize_ipowi.mlir @@ -0,0 +1,442 @@ +// RUN: mlir-opt %s -canonicalize | FileCheck %s + +// CHECK-LABEL: @ipowi32_fold( +// CHECK-SAME: %[[result:.+]]: memref<?xi32> +func.func @ipowi32_fold(%result : memref<?xi32>) { +// CHECK-DAG: %[[cst0:.+]] = arith.constant 0 : i32 +// CHECK-DAG: %[[cst1:.+]] = arith.constant 1 : i32 +// CHECK-DAG: %[[cst1073741824:.+]] = arith.constant 1073741824 : i32 +// CHECK-DAG: %[[cst_m1:.+]] = arith.constant -1 : i32 +// CHECK-DAG: %[[cst_m27:.+]] = arith.constant -27 : i32 +// CHECK-DAG: %[[i0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[i1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[i2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[i3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[i4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[i5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[i6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[i7:.+]] = arith.constant 7 : index +// CHECK-DAG: %[[i8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[i9:.+]] = arith.constant 9 : index +// CHECK-DAG: %[[i10:.+]] = arith.constant 10 : index +// CHECK-DAG: %[[i11:.+]] = arith.constant 11 : index + +// --- Test power == 0 --- + %arg0_base = arith.constant 0 : i32 + %arg0_power = arith.constant 0 : i32 + %res0 = math.ipowi %arg0_base, %arg0_power : i32 + %i0 = arith.constant 0 : index + memref.store %res0, %result[%i0] : memref<?xi32> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i0]]] : memref<?xi32> + + %arg1_base = arith.constant 10 : i32 + %arg1_power = arith.constant 0 : i32 + %res1 = math.ipowi %arg1_base, %arg1_power : i32 + %i1 = arith.constant 1 : index + memref.store %res1, %result[%i1] : memref<?xi32> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i1]]] : memref<?xi32> + + %arg2_base = arith.constant -10 : i32 + %arg2_power = arith.constant 0 : i32 + %res2 = math.ipowi %arg2_base, %arg2_power : i32 + %i2 = arith.constant 2 : index + memref.store %res2, %result[%i2] : memref<?xi32> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i2]]] : memref<?xi32> + +// --- Test negative powers --- + %arg3_base = arith.constant 0 : i32 + %arg3_power = arith.constant -1 : i32 + %res3 = math.ipowi %arg3_base, %arg3_power : i32 + %i3 = arith.constant 3 : index + memref.store %res3, %result[%i3] : memref<?xi32> +// No folding for ipowi(0, x) for x < 0: +// CHECK: %[[res3:.+]] = math.ipowi %[[cst0]], %[[cst_m1]] : i32 +// CHECK: memref.store %[[res3]], %[[result]][%[[i3]]] : memref<?xi32> + + %arg4_base = arith.constant 1 : i32 + %arg4_power = arith.constant -10 : i32 + %res4 = math.ipowi %arg4_base, %arg4_power : i32 + %i4 = arith.constant 4 : index + memref.store %res4, %result[%i4] : memref<?xi32> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i4]]] : memref<?xi32> + + %arg5_base = arith.constant 2 : i32 + %arg5_power = arith.constant -1 : i32 + %res5 = math.ipowi %arg5_base, %arg5_power : i32 + %i5 = arith.constant 5 : index + memref.store %res5, %result[%i5] : memref<?xi32> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i5]]] : memref<?xi32> + + %arg6_base = arith.constant -2 : i32 + %arg6_power = arith.constant -1 : i32 + %res6 = math.ipowi %arg6_base, %arg6_power : i32 + %i6 = arith.constant 6 : index + memref.store %res6, %result[%i6] : memref<?xi32> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i6]]] : memref<?xi32> + + %arg7_base = arith.constant -1 : i32 + %arg7_power = arith.constant -10 : i32 + %res7 = math.ipowi %arg7_base, %arg7_power : i32 + %i7 = arith.constant 7 : index + memref.store %res7, %result[%i7] : memref<?xi32> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i7]]] : memref<?xi32> + + %arg8_base = arith.constant -1 : i32 + %arg8_power = arith.constant -11 : i32 + %res8 = math.ipowi %arg8_base, %arg8_power : i32 + %i8 = arith.constant 8 : index + memref.store %res8, %result[%i8] : memref<?xi32> +// CHECK: memref.store %[[cst_m1]], %[[result]][%[[i8]]] : memref<?xi32> + +// --- Test positive powers --- + %arg9_base = arith.constant -3 : i32 + %arg9_power = arith.constant 3 : i32 + %res9 = math.ipowi %arg9_base, %arg9_power : i32 + %i9 = arith.constant 9 : index + memref.store %res9, %result[%i9] : memref<?xi32> +// CHECK: memref.store %[[cst_m27]], %[[result]][%[[i9]]] : memref<?xi32> + + %arg10_base = arith.constant 2 : i32 + %arg10_power = arith.constant 30 : i32 + %res10 = math.ipowi %arg10_base, %arg10_power : i32 + %i10 = arith.constant 10 : index + memref.store %res10, %result[%i10] : memref<?xi32> +// CHECK: memref.store %[[cst1073741824]], %[[result]][%[[i10]]] : memref<?xi32> + +// --- Test vector folding --- + %arg11_base = arith.constant 2 : i32 + %arg11_base_vec = vector.splat %arg11_base : vector<2x2xi32> + %arg11_power = arith.constant 30 : i32 + %arg11_power_vec = vector.splat %arg11_power : vector<2x2xi32> + %res11_vec = math.ipowi %arg11_base_vec, %arg11_power_vec : vector<2x2xi32> + %i11 = arith.constant 11 : index + %res11 = vector.extract %res11_vec[1, 1] : vector<2x2xi32> + memref.store %res11, %result[%i11] : memref<?xi32> +// CHECK: memref.store %[[cst1073741824]], %[[result]][%[[i11]]] : memref<?xi32> + + return +} + +// CHECK-LABEL: @ipowi64_fold( +// CHECK-SAME: %[[result:.+]]: memref<?xi64> +func.func @ipowi64_fold(%result : memref<?xi64>) { +// CHECK-DAG: %[[cst0:.+]] = arith.constant 0 : i64 +// CHECK-DAG: %[[cst1:.+]] = arith.constant 1 : i64 +// CHECK-DAG: %[[cst1073741824:.+]] = arith.constant 1073741824 : i64 +// CHECK-DAG: %[[cst281474976710656:.+]] = arith.constant 281474976710656 : i64 +// CHECK-DAG: %[[cst_m1:.+]] = arith.constant -1 : i64 +// CHECK-DAG: %[[cst_m27:.+]] = arith.constant -27 : i64 +// CHECK-DAG: %[[i0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[i1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[i2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[i3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[i4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[i5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[i6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[i7:.+]] = arith.constant 7 : index +// CHECK-DAG: %[[i8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[i9:.+]] = arith.constant 9 : index +// CHECK-DAG: %[[i10:.+]] = arith.constant 10 : index +// CHECK-DAG: %[[i11:.+]] = arith.constant 11 : index + +// --- Test power == 0 --- + %arg0_base = arith.constant 0 : i64 + %arg0_power = arith.constant 0 : i64 + %res0 = math.ipowi %arg0_base, %arg0_power : i64 + %i0 = arith.constant 0 : index + memref.store %res0, %result[%i0] : memref<?xi64> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i0]]] : memref<?xi64> + + %arg1_base = arith.constant 10 : i64 + %arg1_power = arith.constant 0 : i64 + %res1 = math.ipowi %arg1_base, %arg1_power : i64 + %i1 = arith.constant 1 : index + memref.store %res1, %result[%i1] : memref<?xi64> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i1]]] : memref<?xi64> + + %arg2_base = arith.constant -10 : i64 + %arg2_power = arith.constant 0 : i64 + %res2 = math.ipowi %arg2_base, %arg2_power : i64 + %i2 = arith.constant 2 : index + memref.store %res2, %result[%i2] : memref<?xi64> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i2]]] : memref<?xi64> + +// --- Test negative powers --- + %arg3_base = arith.constant 0 : i64 + %arg3_power = arith.constant -1 : i64 + %res3 = math.ipowi %arg3_base, %arg3_power : i64 + %i3 = arith.constant 3 : index + memref.store %res3, %result[%i3] : memref<?xi64> +// No folding for ipowi(0, x) for x < 0: +// CHECK: %[[res3:.+]] = math.ipowi %[[cst0]], %[[cst_m1]] : i64 +// CHECK: memref.store %[[res3]], %[[result]][%[[i3]]] : memref<?xi64> + + %arg4_base = arith.constant 1 : i64 + %arg4_power = arith.constant -10 : i64 + %res4 = math.ipowi %arg4_base, %arg4_power : i64 + %i4 = arith.constant 4 : index + memref.store %res4, %result[%i4] : memref<?xi64> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i4]]] : memref<?xi64> + + %arg5_base = arith.constant 2 : i64 + %arg5_power = arith.constant -1 : i64 + %res5 = math.ipowi %arg5_base, %arg5_power : i64 + %i5 = arith.constant 5 : index + memref.store %res5, %result[%i5] : memref<?xi64> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i5]]] : memref<?xi64> + + %arg6_base = arith.constant -2 : i64 + %arg6_power = arith.constant -1 : i64 + %res6 = math.ipowi %arg6_base, %arg6_power : i64 + %i6 = arith.constant 6 : index + memref.store %res6, %result[%i6] : memref<?xi64> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i6]]] : memref<?xi64> + + %arg7_base = arith.constant -1 : i64 + %arg7_power = arith.constant -10 : i64 + %res7 = math.ipowi %arg7_base, %arg7_power : i64 + %i7 = arith.constant 7 : index + memref.store %res7, %result[%i7] : memref<?xi64> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i7]]] : memref<?xi64> + + %arg8_base = arith.constant -1 : i64 + %arg8_power = arith.constant -11 : i64 + %res8 = math.ipowi %arg8_base, %arg8_power : i64 + %i8 = arith.constant 8 : index + memref.store %res8, %result[%i8] : memref<?xi64> +// CHECK: memref.store %[[cst_m1]], %[[result]][%[[i8]]] : memref<?xi64> + +// --- Test positive powers --- + %arg9_base = arith.constant -3 : i64 + %arg9_power = arith.constant 3 : i64 + %res9 = math.ipowi %arg9_base, %arg9_power : i64 + %i9 = arith.constant 9 : index + memref.store %res9, %result[%i9] : memref<?xi64> +// CHECK: memref.store %[[cst_m27]], %[[result]][%[[i9]]] : memref<?xi64> + + %arg10_base = arith.constant 2 : i64 + %arg10_power = arith.constant 30 : i64 + %res10 = math.ipowi %arg10_base, %arg10_power : i64 + %i10 = arith.constant 10 : index + memref.store %res10, %result[%i10] : memref<?xi64> +// CHECK: memref.store %[[cst1073741824]], %[[result]][%[[i10]]] : memref<?xi64> + + %arg11_base = arith.constant 2 : i64 + %arg11_power = arith.constant 48 : i64 + %res11 = math.ipowi %arg11_base, %arg11_power : i64 + %i11 = arith.constant 11 : index + memref.store %res11, %result[%i11] : memref<?xi64> +// CHECK: memref.store %[[cst281474976710656]], %[[result]][%[[i11]]] : memref<?xi64> + + return +} + +// CHECK-LABEL: @ipowi16_fold( +// CHECK-SAME: %[[result:.+]]: memref<?xi16> +func.func @ipowi16_fold(%result : memref<?xi16>) { +// CHECK-DAG: %[[cst0:.+]] = arith.constant 0 : i16 +// CHECK-DAG: %[[cst1:.+]] = arith.constant 1 : i16 +// CHECK-DAG: %[[cst16384:.+]] = arith.constant 16384 : i16 +// CHECK-DAG: %[[cst_m1:.+]] = arith.constant -1 : i16 +// CHECK-DAG: %[[cst_m27:.+]] = arith.constant -27 : i16 +// CHECK-DAG: %[[i0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[i1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[i2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[i3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[i4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[i5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[i6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[i7:.+]] = arith.constant 7 : index +// CHECK-DAG: %[[i8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[i9:.+]] = arith.constant 9 : index +// CHECK-DAG: %[[i10:.+]] = arith.constant 10 : index + +// --- Test power == 0 --- + %arg0_base = arith.constant 0 : i16 + %arg0_power = arith.constant 0 : i16 + %res0 = math.ipowi %arg0_base, %arg0_power : i16 + %i0 = arith.constant 0 : index + memref.store %res0, %result[%i0] : memref<?xi16> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i0]]] : memref<?xi16> + + %arg1_base = arith.constant 10 : i16 + %arg1_power = arith.constant 0 : i16 + %res1 = math.ipowi %arg1_base, %arg1_power : i16 + %i1 = arith.constant 1 : index + memref.store %res1, %result[%i1] : memref<?xi16> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i1]]] : memref<?xi16> + + %arg2_base = arith.constant -10 : i16 + %arg2_power = arith.constant 0 : i16 + %res2 = math.ipowi %arg2_base, %arg2_power : i16 + %i2 = arith.constant 2 : index + memref.store %res2, %result[%i2] : memref<?xi16> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i2]]] : memref<?xi16> + +// --- Test negative powers --- + %arg3_base = arith.constant 0 : i16 + %arg3_power = arith.constant -1 : i16 + %res3 = math.ipowi %arg3_base, %arg3_power : i16 + %i3 = arith.constant 3 : index + memref.store %res3, %result[%i3] : memref<?xi16> +// No folding for ipowi(0, x) for x < 0: +// CHECK: %[[res3:.+]] = math.ipowi %[[cst0]], %[[cst_m1]] : i16 +// CHECK: memref.store %[[res3]], %[[result]][%[[i3]]] : memref<?xi16> + + %arg4_base = arith.constant 1 : i16 + %arg4_power = arith.constant -10 : i16 + %res4 = math.ipowi %arg4_base, %arg4_power : i16 + %i4 = arith.constant 4 : index + memref.store %res4, %result[%i4] : memref<?xi16> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i4]]] : memref<?xi16> + + %arg5_base = arith.constant 2 : i16 + %arg5_power = arith.constant -1 : i16 + %res5 = math.ipowi %arg5_base, %arg5_power : i16 + %i5 = arith.constant 5 : index + memref.store %res5, %result[%i5] : memref<?xi16> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i5]]] : memref<?xi16> + + %arg6_base = arith.constant -2 : i16 + %arg6_power = arith.constant -1 : i16 + %res6 = math.ipowi %arg6_base, %arg6_power : i16 + %i6 = arith.constant 6 : index + memref.store %res6, %result[%i6] : memref<?xi16> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i6]]] : memref<?xi16> + + %arg7_base = arith.constant -1 : i16 + %arg7_power = arith.constant -10 : i16 + %res7 = math.ipowi %arg7_base, %arg7_power : i16 + %i7 = arith.constant 7 : index + memref.store %res7, %result[%i7] : memref<?xi16> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i7]]] : memref<?xi16> + + %arg8_base = arith.constant -1 : i16 + %arg8_power = arith.constant -11 : i16 + %res8 = math.ipowi %arg8_base, %arg8_power : i16 + %i8 = arith.constant 8 : index + memref.store %res8, %result[%i8] : memref<?xi16> +// CHECK: memref.store %[[cst_m1]], %[[result]][%[[i8]]] : memref<?xi16> + +// --- Test positive powers --- + %arg9_base = arith.constant -3 : i16 + %arg9_power = arith.constant 3 : i16 + %res9 = math.ipowi %arg9_base, %arg9_power : i16 + %i9 = arith.constant 9 : index + memref.store %res9, %result[%i9] : memref<?xi16> +// CHECK: memref.store %[[cst_m27]], %[[result]][%[[i9]]] : memref<?xi16> + + %arg10_base = arith.constant 2 : i16 + %arg10_power = arith.constant 14 : i16 + %res10 = math.ipowi %arg10_base, %arg10_power : i16 + %i10 = arith.constant 10 : index + memref.store %res10, %result[%i10] : memref<?xi16> +// CHECK: memref.store %[[cst16384]], %[[result]][%[[i10]]] : memref<?xi16> + + return +} + +// CHECK-LABEL: @ipowi8_fold( +// CHECK-SAME: %[[result:.+]]: memref<?xi8> +func.func @ipowi8_fold(%result : memref<?xi8>) { +// CHECK-DAG: %[[cst0:.+]] = arith.constant 0 : i8 +// CHECK-DAG: %[[cst1:.+]] = arith.constant 1 : i8 +// CHECK-DAG: %[[cst64:.+]] = arith.constant 64 : i8 +// CHECK-DAG: %[[cst_m1:.+]] = arith.constant -1 : i8 +// CHECK-DAG: %[[cst_m27:.+]] = arith.constant -27 : i8 +// CHECK-DAG: %[[i0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[i1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[i2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[i3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[i4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[i5:.+]] = arith.constant 5 : index +// CHECK-DAG: %[[i6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[i7:.+]] = arith.constant 7 : index +// CHECK-DAG: %[[i8:.+]] = arith.constant 8 : index +// CHECK-DAG: %[[i9:.+]] = arith.constant 9 : index +// CHECK-DAG: %[[i10:.+]] = arith.constant 10 : index + +// --- Test power == 0 --- + %arg0_base = arith.constant 0 : i8 + %arg0_power = arith.constant 0 : i8 + %res0 = math.ipowi %arg0_base, %arg0_power : i8 + %i0 = arith.constant 0 : index + memref.store %res0, %result[%i0] : memref<?xi8> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i0]]] : memref<?xi8> + + %arg1_base = arith.constant 10 : i8 + %arg1_power = arith.constant 0 : i8 + %res1 = math.ipowi %arg1_base, %arg1_power : i8 + %i1 = arith.constant 1 : index + memref.store %res1, %result[%i1] : memref<?xi8> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i1]]] : memref<?xi8> + + %arg2_base = arith.constant -10 : i8 + %arg2_power = arith.constant 0 : i8 + %res2 = math.ipowi %arg2_base, %arg2_power : i8 + %i2 = arith.constant 2 : index + memref.store %res2, %result[%i2] : memref<?xi8> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i2]]] : memref<?xi8> + +// --- Test negative powers --- + %arg3_base = arith.constant 0 : i8 + %arg3_power = arith.constant -1 : i8 + %res3 = math.ipowi %arg3_base, %arg3_power : i8 + %i3 = arith.constant 3 : index + memref.store %res3, %result[%i3] : memref<?xi8> +// No folding for ipowi(0, x) for x < 0: +// CHECK: %[[res3:.+]] = math.ipowi %[[cst0]], %[[cst_m1]] : i8 +// CHECK: memref.store %[[res3]], %[[result]][%[[i3]]] : memref<?xi8> + + %arg4_base = arith.constant 1 : i8 + %arg4_power = arith.constant -10 : i8 + %res4 = math.ipowi %arg4_base, %arg4_power : i8 + %i4 = arith.constant 4 : index + memref.store %res4, %result[%i4] : memref<?xi8> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i4]]] : memref<?xi8> + + %arg5_base = arith.constant 2 : i8 + %arg5_power = arith.constant -1 : i8 + %res5 = math.ipowi %arg5_base, %arg5_power : i8 + %i5 = arith.constant 5 : index + memref.store %res5, %result[%i5] : memref<?xi8> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i5]]] : memref<?xi8> + + %arg6_base = arith.constant -2 : i8 + %arg6_power = arith.constant -1 : i8 + %res6 = math.ipowi %arg6_base, %arg6_power : i8 + %i6 = arith.constant 6 : index + memref.store %res6, %result[%i6] : memref<?xi8> +// CHECK: memref.store %[[cst0]], %[[result]][%[[i6]]] : memref<?xi8> + + %arg7_base = arith.constant -1 : i8 + %arg7_power = arith.constant -10 : i8 + %res7 = math.ipowi %arg7_base, %arg7_power : i8 + %i7 = arith.constant 7 : index + memref.store %res7, %result[%i7] : memref<?xi8> +// CHECK: memref.store %[[cst1]], %[[result]][%[[i7]]] : memref<?xi8> + + %arg8_base = arith.constant -1 : i8 + %arg8_power = arith.constant -11 : i8 + %res8 = math.ipowi %arg8_base, %arg8_power : i8 + %i8 = arith.constant 8 : index + memref.store %res8, %result[%i8] : memref<?xi8> +// CHECK: memref.store %[[cst_m1]], %[[result]][%[[i8]]] : memref<?xi8> + +// --- Test positive powers --- + %arg9_base = arith.constant -3 : i8 + %arg9_power = arith.constant 3 : i8 + %res9 = math.ipowi %arg9_base, %arg9_power : i8 + %i9 = arith.constant 9 : index + memref.store %res9, %result[%i9] : memref<?xi8> +// CHECK: memref.store %[[cst_m27]], %[[result]][%[[i9]]] : memref<?xi8> + + %arg10_base = arith.constant 2 : i8 + %arg10_power = arith.constant 6 : i8 + %res10 = math.ipowi %arg10_base, %arg10_power : i8 + %i10 = arith.constant 10 : index + memref.store %res10, %result[%i10] : memref<?xi8> +// CHECK: memref.store %[[cst64]], %[[result]][%[[i10]]] : memref<?xi8> + + return +} diff --git a/mlir/test/Dialect/NVGPU/mma-sync-f32-to-tf32.mlir b/mlir/test/Dialect/NVGPU/mma-sync-f32-to-tf32.mlir --- a/mlir/test/Dialect/NVGPU/mma-sync-f32-to-tf32.mlir +++ b/mlir/test/Dialect/NVGPU/mma-sync-f32-to-tf32.mlir @@ -18,3 +18,12 @@ return %d : vector<2x2xf32> } // ----- + +// Negative test for the non-f32 case. +// CHECK-LABEL: mma_sync_f16 +// CHECK-NOT: tf32Enabled +// CHECK: return +func.func @mma_sync_f16(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> { + %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16> + return %d : vector<2x2xf16> +}
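Stepping back to the canonicalize_ipowi.mlir expectations above: every CHECK value follows from ordinary integer-power semantics, with negative exponents rewritten via 1/base in integer division. A hypothetical scalar reference in plain C++, not the folder's actual code (which presumably operates on APInt); INT64_MIN exponents are ignored for brevity:

#include <cstdint>
#include <optional>

// Illustrative scalar fold for math.ipowi with i64 semantics.
// For exp < 0, x**exp becomes (1/x)**(-exp) under truncating integer
// division, so only bases 0, 1 and -1 survive; ipowi(0, exp < 0) is
// left unfolded because the result is a division by zero.
static std::optional<int64_t> foldIPowI(int64_t base, int64_t exp) {
  if (exp < 0) {
    if (base == 0)
      return std::nullopt; // keep the op, as the tests above expect
    base = 1 / base;       // 0 for |base| > 1, otherwise +/-1
    exp = -exp;
  }
  int64_t result = 1;
  while (exp > 0) { // exponentiation by squaring
    if (exp & 1)
      result *= base;
    base *= base;
    exp >>= 1;
  }
  return result; // e.g. foldIPowI(-3, 3) == -27, foldIPowI(2, 30) == 1 << 30
}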
diff --git a/mlir/test/Dialect/SPIRV/IR/joint-matrix-ops.mlir b/mlir/test/Dialect/SPIRV/IR/joint-matrix-ops.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/SPIRV/IR/joint-matrix-ops.mlir @@ -0,0 +1,158 @@ +// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -verify-diagnostics %s | FileCheck %s + +// CHECK-LABEL: @joint_matrix_load +spv.func @joint_matrix_load(%ptr : !spv.ptr<i32, Workgroup>, %stride : i32) "None" { + // CHECK: {{%.*}} = spv.JointMatrixLoadINTEL {{%.*}}, {{%.*}} : (!spv.ptr<i32, Workgroup>, i32) -> !spv.jointmatrix<16x8xi32, RowMajor, Workgroup> + %0 = spv.JointMatrixLoadINTEL %ptr, %stride : (!spv.ptr<i32, Workgroup>, i32) -> !spv.jointmatrix<16x8xi32, RowMajor, Workgroup> + spv.Return +} + +// ----- +// CHECK-LABEL: @joint_matrix_load_memaccess +spv.func @joint_matrix_load_memaccess(%ptr : !spv.ptr<i32, CrossWorkgroup>, %stride : i32) "None" { + // CHECK: {{%.*}} = spv.JointMatrixLoadINTEL {{%.*}}, {{%.*}} {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, i32) -> !spv.jointmatrix<8x16xi32, ColumnMajor, Subgroup> + %0 = spv.JointMatrixLoadINTEL %ptr, %stride {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, i32) -> !spv.jointmatrix<8x16xi32, ColumnMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_load_diff_ptr_type +spv.func @joint_matrix_load_diff_ptr_type(%ptr : !spv.ptr<vector<4xi32>, Workgroup>, %stride : i32) "None" { + // CHECK: {{%.*}} = spv.JointMatrixLoadINTEL {{%.*}}, {{%.*}} {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<vector<4xi32>, Workgroup>, i32) -> !spv.jointmatrix<8x16xi32, RowMajor, Workgroup> + %0 = spv.JointMatrixLoadINTEL %ptr, %stride {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<vector<4xi32>, Workgroup>, i32) -> !spv.jointmatrix<8x16xi32, RowMajor, Workgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_store +spv.func @joint_matrix_store(%ptr : !spv.ptr<i32, Workgroup>, %stride : i32, %m : !spv.jointmatrix<8x16xi32, RowMajor, Workgroup>) "None" { + // CHECK: spv.JointMatrixStoreINTEL {{%.*}}, {{%.*}}, {{%.*}} : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Workgroup>, i32) + spv.JointMatrixStoreINTEL %ptr, %m, %stride : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Workgroup>, i32) + spv.Return +} + +// CHECK-LABEL: @joint_matrix_store_memaccess +spv.func @joint_matrix_store_memaccess(%ptr : !spv.ptr<i32, Workgroup>, %m : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %stride : i32) "None" { + // CHECK: spv.JointMatrixStoreINTEL {{%.*}}, {{%.*}}, {{%.*}} {Volatile} : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, i32) + spv.JointMatrixStoreINTEL %ptr, %m, %stride {Volatile} : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, i32) + spv.Return +} + +// CHECK-LABEL: @joint_matrix_length +spv.func @joint_matrix_length() -> i32 "None" { + // CHECK: {{%.*}} = spv.JointMatrixWorkItemLengthINTEL : !spv.jointmatrix<8x16xi32, PackedB, Subgroup> + %0 = spv.JointMatrixWorkItemLengthINTEL : !spv.jointmatrix<8x16xi32, PackedB, Subgroup> + spv.ReturnValue %0 : i32 +} + +// CHECK-LABEL: @joint_matrix_muladd
spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<8x32xi8, RowMajor, Subgroup>, %b : !spv.jointmatrix<32x8xi8, ColumnMajor, Subgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.JointMatrixMadINTEL {{%.*}}, {{%.*}}, {{%.*}} : !spv.jointmatrix<8x32xi8, RowMajor, Subgroup>, !spv.jointmatrix<32x8xi8, ColumnMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<8x32xi8, RowMajor, Subgroup>, !spv.jointmatrix<32x8xi8, ColumnMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_add +spv.func @joint_matrix_add(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.IAdd {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.IAdd %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_sub +spv.func @joint_matrix_sub(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.ISub {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.ISub %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_sdiv +spv.func @joint_matrix_sdiv(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.SDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.SDiv %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_udiv +spv.func @joint_matrix_udiv(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.UDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.UDiv %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_fadd +spv.func @joint_matrix_fadd(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FAdd {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FAdd %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_fsub +spv.func @joint_matrix_fsub(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FSub {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FSub %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return +} + +// CHECK-LABEL: @joint_matrix_fdiv +spv.func @joint_matrix_fdiv(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FDiv %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +// CHECK-LABEL: @joint_matrix_access_chain +spv.func @joint_matrix_access_chain(%a : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>) -> !spv.ptr<f32, Function> "None" { + %0 = spv.Constant 0: i32 + // CHECK: {{%.*}} = spv.AccessChain {{%.*}}[{{%.*}}] : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>, i32 + %1 = spv.AccessChain %a[%0] : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>, i32 + spv.ReturnValue %1 : !spv.ptr<f32, Function> +} + +// ----- + +spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<16x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<16x8xi32, RowMajor, Subgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // expected-error @+1 {{'spv.JointMatrixMadINTEL' op matrix size must match}} + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<16x16xi32, RowMajor, Subgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // expected-error @+1 {{'spv.JointMatrixMadINTEL' op matrix size must match}} + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<16x8xi32, RowMajor, Workgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // expected-error @+1 {{'spv.JointMatrixMadINTEL' op matrix scope must match}} + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Workgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<16x8xi32, RowMajor, Subgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // expected-error @+1 {{matrix element type must match}} + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +spv.func @joint_matrix_load_memaccess(%ptr : !spv.ptr<!spv.struct<(i32 [0])>, Workgroup>, %stride : i32) "None" { + // expected-error @+1 {{Pointer must point to a scalar or vector type}} + %0 = spv.JointMatrixLoadINTEL %ptr, %stride : (!spv.ptr<!spv.struct<(i32 [0])>, Workgroup>, i32)-> !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} + +// ----- + +spv.func @joint_matrix_load_memaccess(%ptr : !spv.ptr<i32, Function>, %stride : i32) "None" { + // expected-error @+1 {{Pointer storage class must be Workgroup or CrossWorkgroup}} + %0 = spv.JointMatrixLoadINTEL %ptr, %stride : (!spv.ptr<i32, Function>, i32) -> !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return +} diff --git a/mlir/test/Target/SPIRV/joint-matrix-ops.mlir b/mlir/test/Target/SPIRV/joint-matrix-ops.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Target/SPIRV/joint-matrix-ops.mlir @@ -0,0 +1,102 @@ +// RUN: mlir-translate -test-spirv-roundtrip -split-input-file %s | FileCheck %s + +spv.module Logical GLSL450 requires #spv.vce<v1.0, [JointMatrixINTEL], [SPV_INTEL_joint_matrix]> { + // CHECK-LABEL: @joint_matrix_load + spv.func @joint_matrix_load(%ptr : !spv.ptr<i32, Workgroup>, %stride : i32) "None" { + // CHECK: {{%.*}} = spv.JointMatrixLoadINTEL {{%.*}}, {{%.*}} : (!spv.ptr<i32, Workgroup>, i32) -> !spv.jointmatrix<16x8xi32, RowMajor, Workgroup> + %0 = spv.JointMatrixLoadINTEL %ptr, %stride : (!spv.ptr<i32, Workgroup>, i32) -> !spv.jointmatrix<16x8xi32, RowMajor, Workgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_load_memaccess + spv.func @joint_matrix_load_memaccess(%ptr : !spv.ptr<i32, CrossWorkgroup>, %stride : i32) "None" { + // CHECK: {{%.*}} = spv.JointMatrixLoadINTEL {{%.*}}, {{%.*}} {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, i32) -> !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %0 = spv.JointMatrixLoadINTEL %ptr, %stride {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, i32) -> !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_store + spv.func @joint_matrix_store(%ptr : !spv.ptr<i32, Workgroup>, %stride : i32, %m : !spv.jointmatrix<16x8xi32, RowMajor, Workgroup>) "None" { + // CHECK: spv.JointMatrixStoreINTEL {{%.*}}, {{%.*}}, {{%.*}} : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Workgroup>, i32) + spv.JointMatrixStoreINTEL %ptr, %m, %stride : (!spv.ptr<i32, Workgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Workgroup>, i32) + spv.Return + } + + // CHECK-LABEL: @joint_matrix_store_memaccess + spv.func @joint_matrix_store_memaccess(%ptr : !spv.ptr<i32, CrossWorkgroup>, %m : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %stride : i32) "None" { + // CHECK: spv.JointMatrixStoreINTEL {{%.*}}, {{%.*}}, {{%.*}} {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, i32) + spv.JointMatrixStoreINTEL %ptr, %m, %stride {memory_access = #spv.memory_access<Volatile>} : (!spv.ptr<i32, CrossWorkgroup>, !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, i32) + spv.Return + } + + // CHECK-LABEL: @joint_matrix_length + spv.func @joint_matrix_length() -> i32 "None" { + // CHECK: {{%.*}} = spv.JointMatrixWorkItemLengthINTEL : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %0 = spv.JointMatrixWorkItemLengthINTEL : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.ReturnValue %0 : i32 + } + + // CHECK-LABEL: @joint_matrix_muladd + spv.func @joint_matrix_muladd(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<16x8xi32, RowMajor, Subgroup>, %c : !spv.jointmatrix<8x8xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.JointMatrixMadINTEL {{%.*}}, {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + %r = spv.JointMatrixMadINTEL %a, %b, %c : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, !spv.jointmatrix<16x8xi32, RowMajor, Subgroup> -> !spv.jointmatrix<8x8xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_add + spv.func @joint_matrix_add(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.IAdd {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.IAdd %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_sub + spv.func @joint_matrix_sub(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.ISub {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.ISub %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_sdiv + spv.func @joint_matrix_sdiv(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.SDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.SDiv %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_udiv + spv.func @joint_matrix_udiv(%a : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.UDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + %r = spv.UDiv %a, %b : !spv.jointmatrix<8x16xi32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_fadd + spv.func @joint_matrix_fadd(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FAdd {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FAdd %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_fsub + spv.func @joint_matrix_fsub(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FSub {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FSub %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_fdiv + spv.func @joint_matrix_fdiv(%a : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup>) "None" { + // CHECK: {{%.*}} = spv.FDiv {{%.*}}, {{%.*}} : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + %r = spv.FDiv %a, %b : !spv.jointmatrix<8x16xf32, RowMajor, Subgroup> + spv.Return + } + + // CHECK-LABEL: @joint_matrix_access_chain + spv.func @joint_matrix_access_chain(%a : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>) -> !spv.ptr<f32, Function> "None" { + %0 = spv.Constant 0: i32 + // CHECK: {{%.*}} = spv.AccessChain {{%.*}}[{{%.*}}] : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>, i32 + %1 = spv.AccessChain %a[%0] : !spv.ptr<!spv.jointmatrix<8x16xf32, RowMajor, Subgroup>, Function>, i32 + spv.ReturnValue %1 : !spv.ptr<f32, Function> + } +} diff --git a/mlir/test/Target/SPIRV/memory-ops.mlir b/mlir/test/Target/SPIRV/memory-ops.mlir --- a/mlir/test/Target/SPIRV/memory-ops.mlir +++ b/mlir/test/Target/SPIRV/memory-ops.mlir @@ -1,15 +1,25 @@ // RUN: mlir-translate -test-spirv-roundtrip -split-input-file %s | FileCheck %s -// CHECK: spv.func {{@.*}}([[ARG1:%.*]]: !spv.ptr<f32, Input>, [[ARG2:%.*]]: !spv.ptr<f32, Output>) "None" { -// CHECK-NEXT: [[VALUE:%.*]] = spv.Load "Input" [[ARG1]] : f32 -// CHECK-NEXT: spv.Store "Output" [[ARG2]], [[VALUE]] : f32 spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], []> { + // CHECK-LABEL: spv.func @load_store + // CHECK-SAME: ([[ARG1:%.*]]: !spv.ptr<f32, Input>, [[ARG2:%.*]]: !spv.ptr<f32, Output>) spv.func @load_store(%arg0 : !spv.ptr<f32, Input>, %arg1 : !spv.ptr<f32, Output>) "None" { + // CHECK-NEXT: [[VALUE:%.*]] = spv.Load "Input" [[ARG1]] : f32 %1 = spv.Load "Input" %arg0 : f32 + // CHECK-NEXT: spv.Store "Output" [[ARG2]], [[VALUE]] : f32 spv.Store "Output" %arg1, %1 : f32 spv.Return } + + // CHECK-LABEL: spv.func @load_store_memory_operands + spv.func @load_store_memory_operands(%arg0 : !spv.ptr<f32, Input>, %arg1 : !spv.ptr<f32, Output>) "None" { + // CHECK: spv.Load "Input" %{{.+}} ["Volatile|Aligned", 4] : f32 + %1 = spv.Load "Input" %arg0 ["Volatile|Aligned", 4]: f32 + // CHECK: spv.Store "Output" %{{.+}}, %{{.+}} ["Volatile|Aligned", 4] : f32 + spv.Store "Output" %arg1, %1 ["Volatile|Aligned", 4]: f32 + spv.Return + } } // ----- diff --git a/mlir/test/Transforms/test-toposort.mlir b/mlir/test/Transforms/test-toposort.mlir --- a/mlir/test/Transforms/test-toposort.mlir +++ b/mlir/test/Transforms/test-toposort.mlir @@ -1,27 +1,39 @@ // RUN: mlir-opt -topological-sort %s | FileCheck %s +// RUN: mlir-opt -test-topological-sort-analysis %s | FileCheck %s -check-prefix=CHECK-ANALYSIS // Test producer is after user.
// CHECK-LABEL: test.graph_region -test.graph_region { +// CHECK-ANALYSIS-LABEL: test.graph_region +test.graph_region attributes{"root"} { // CHECK-NEXT: test.foo // CHECK-NEXT: test.baz // CHECK-NEXT: test.bar - %0 = "test.foo"() : () -> i32 - "test.bar"(%1, %0) : (i32, i32) -> () - %1 = "test.baz"() : () -> i32 + + // CHECK-ANALYSIS-NEXT: test.foo{{.*}} {pos = 0 + // CHECK-ANALYSIS-NEXT: test.bar{{.*}} {pos = 2 + // CHECK-ANALYSIS-NEXT: test.baz{{.*}} {pos = 1 + %0 = "test.foo"() {selected} : () -> i32 + "test.bar"(%1, %0) {selected} : (i32, i32) -> () + %1 = "test.baz"() {selected} : () -> i32 } // Test cycles. // CHECK-LABEL: test.graph_region -test.graph_region { +// CHECK-ANALYSIS-LABEL: test.graph_region +test.graph_region attributes{"root"} { // CHECK-NEXT: test.d // CHECK-NEXT: test.a // CHECK-NEXT: test.c // CHECK-NEXT: test.b - %2 = "test.c"(%1) : (i32) -> i32 + + // CHECK-ANALYSIS-NEXT: test.c{{.*}} {pos = 0 + // CHECK-ANALYSIS-NEXT: test.b{{.*}} : ( + // CHECK-ANALYSIS-NEXT: test.a{{.*}} {pos = 2 + // CHECK-ANALYSIS-NEXT: test.d{{.*}} {pos = 1 + %2 = "test.c"(%1) {selected} : (i32) -> i32 %1 = "test.b"(%0, %2) : (i32, i32) -> i32 - %0 = "test.a"(%3) : (i32) -> i32 - %3 = "test.d"() : () -> i32 + %0 = "test.a"(%3) {selected} : (i32) -> i32 + %3 = "test.d"() {selected} : () -> i32 } // Test block arguments. diff --git a/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.cpp --- a/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.cpp +++ b/mlir/test/lib/Analysis/DataFlow/TestDenseDataFlowAnalysis.cpp @@ -61,9 +61,6 @@ /// The lattice is always initialized. bool isUninitialized() const override { return false; } - /// Initialize the lattice. Does nothing. - ChangeResult defaultInitialize() override { return ChangeResult::NoChange; } - /// Mark the lattice as having reached its pessimistic fixpoint. That is, the /// last modifications of all memory resources are unknown. ChangeResult reset() override { @@ -73,9 +70,6 @@ return ChangeResult::Change; } - /// The lattice is never at a fixpoint. - bool isAtFixpoint() const override { return false; } - /// Join the last modifications. ChangeResult join(const AbstractDenseLattice &lattice) override { const auto &rhs = static_cast(lattice); diff --git a/mlir/test/lib/Analysis/TestDataFlowFramework.cpp b/mlir/test/lib/Analysis/TestDataFlowFramework.cpp --- a/mlir/test/lib/Analysis/TestDataFlowFramework.cpp +++ b/mlir/test/lib/Analysis/TestDataFlowFramework.cpp @@ -20,9 +20,6 @@ using AnalysisState::AnalysisState; - /// Default-initialize the state to zero. - ChangeResult defaultInitialize() override { return join(0); } - /// Returns true if the state is uninitialized. 
bool isUninitialized() const override { return !state; } diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp --- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp @@ -249,14 +249,16 @@ template <typename IdOp, typename NProcsOp> static SmallVector<ProcInfo, 2> -getGpuProcIds(OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges) { +getGpuProcIds(OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges, + ArrayRef<DistributionMethod> distributionMethod) { size_t count = std::min<size_t>(3, parallelLoopRanges.size()); SmallVector<ProcInfo, 2> procInfo(count); Type indexType = b.getIndexType(); for (unsigned i = 0; i < count; ++i) { gpu::Dimension dim = *gpu::symbolizeDimension(i); procInfo[count - 1 - i] = {b.create<IdOp>(loc, indexType, dim), - b.create<NProcsOp>(loc, indexType, dim)}; + b.create<NProcsOp>(loc, indexType, dim), + distributionMethod[count - 1 - i]}; } return procInfo; } @@ -265,10 +267,15 @@ RewritePatternSet &patterns) { { LinalgLoopDistributionOptions cyclicNprocsEqNiters; - cyclicNprocsEqNiters.distributionMethod.resize( - 2, DistributionMethod::CyclicNumProcsEqNumIters); + SmallVector<DistributionMethod> distributionMethod = { + DistributionMethod::CyclicNumProcsEqNumIters, + DistributionMethod::CyclicNumProcsEqNumIters}; cyclicNprocsEqNiters.procInfo = - getGpuProcIds; + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -282,10 +289,15 @@ { LinalgLoopDistributionOptions cyclicNprocsGeNiters; - cyclicNprocsGeNiters.distributionMethod.resize( - 2, DistributionMethod::CyclicNumProcsGeNumIters); + SmallVector<DistributionMethod> distributionMethod = { + DistributionMethod::CyclicNumProcsGeNumIters, + DistributionMethod::CyclicNumProcsGeNumIters}; cyclicNprocsGeNiters.procInfo = - getGpuProcIds; + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -299,10 +311,14 @@ { LinalgLoopDistributionOptions cyclicNprocsDefault; - cyclicNprocsDefault.distributionMethod.resize(2, - DistributionMethod::Cyclic); + SmallVector<DistributionMethod> distributionMethod = { + DistributionMethod::Cyclic, DistributionMethod::Cyclic}; cyclicNprocsDefault.procInfo = - getGpuProcIds; + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -316,10 +332,15 @@ { LinalgLoopDistributionOptions cyclicNprocsMixed1; - cyclicNprocsMixed1.distributionMethod = { + SmallVector<DistributionMethod> distributionMethod = { DistributionMethod::CyclicNumProcsEqNumIters, DistributionMethod::CyclicNumProcsGeNumIters}; - cyclicNprocsMixed1.procInfo = getGpuProcIds; + cyclicNprocsMixed1.procInfo = + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -333,10 +354,15 @@ { LinalgLoopDistributionOptions cyclicNprocsMixed2; - cyclicNprocsMixed2.distributionMethod = { + SmallVector<DistributionMethod> distributionMethod = { DistributionMethod::CyclicNumProcsGeNumIters, DistributionMethod::Cyclic}; - cyclicNprocsMixed2.procInfo = getGpuProcIds; + cyclicNprocsMixed2.procInfo = + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -350,10 +376,15 @@ { LinalgLoopDistributionOptions cyclicNprocsMixed3; - cyclicNprocsMixed3.distributionMethod = { + SmallVector<DistributionMethod> distributionMethod = { DistributionMethod::Cyclic, DistributionMethod::CyclicNumProcsEqNumIters}; - cyclicNprocsMixed3.procInfo = getGpuProcIds; + cyclicNprocsMixed3.procInfo = + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, @@ -368,10 +399,14 @@ { LinalgLoopDistributionOptions cyclicNprocsEqNiters; - cyclicNprocsEqNiters.distributionMethod.resize(2, - DistributionMethod::Cyclic); + SmallVector<DistributionMethod> distributionMethod = { + DistributionMethod::Cyclic, DistributionMethod::Cyclic}; cyclicNprocsEqNiters.procInfo = - getGpuProcIds; + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingOptions() @@ -387,8 +422,14 @@ static void fillTileFuseAndDistributePatterns(MLIRContext *context, RewritePatternSet &patterns) { LinalgLoopDistributionOptions cyclicNprocsEqNiters; - cyclicNprocsEqNiters.distributionMethod.resize(2, DistributionMethod::Cyclic); - cyclicNprocsEqNiters.procInfo = getGpuProcIds; + SmallVector<DistributionMethod> distributionMethod = { + DistributionMethod::Cyclic, DistributionMethod::Cyclic}; + cyclicNprocsEqNiters.procInfo = + [distributionMethod](OpBuilder &b, Location loc, + ArrayRef<Range> parallelLoopRanges) { + return getGpuProcIds( + b, loc, parallelLoopRanges, distributionMethod); + }; patterns.add( MatmulOp::getOperationName(), context, LinalgTilingAndFusionOptions() diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -5,6 +5,7 @@ TestControlFlowSink.cpp TestInlining.cpp TestIntRangeInference.cpp + TestTopologicalSort.cpp EXCLUDE_FROM_LIBMLIR diff --git a/mlir/test/lib/Transforms/TestTopologicalSort.cpp b/mlir/test/lib/Transforms/TestTopologicalSort.cpp new file mode 100644 --- /dev/null +++ b/mlir/test/lib/Transforms/TestTopologicalSort.cpp @@ -0,0 +1,62 @@ +//===- TestTopologicalSort.cpp - Pass to test topological sort analysis ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/TopologicalSortUtils.h" + +using namespace mlir; + +namespace { +struct TestTopologicalSortAnalysisPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestTopologicalSortAnalysisPass) + + StringRef getArgument() const final { + return "test-topological-sort-analysis"; + } + StringRef getDescription() const final { + return "Test topological sorting of ops"; + } + + void runOnOperation() override { + Operation *op = getOperation(); + OpBuilder builder(op->getContext()); + + op->walk([&](Operation *root) { + if (!root->hasAttr("root")) + return WalkResult::advance(); + + assert(root->getNumRegions() == 1 && root->getRegion(0).hasOneBlock() && + "expected one block"); + Block *block = &root->getRegion(0).front(); + SmallVector selectedOps; + block->walk([&](Operation *op) { + if (op->hasAttr("selected")) + selectedOps.push_back(op); + }); + + computeTopologicalSorting(block, selectedOps); + for (const auto &it : llvm::enumerate(selectedOps)) + it.value()->setAttr("pos", builder.getIndexAttr(it.index())); + + return WalkResult::advance(); + }); + } +}; +} // namespace + +namespace mlir { +namespace test { +void registerTestTopologicalSortAnalysisPass() { + PassRegistration(); +} +} // namespace test +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -111,6 +111,7 @@ void registerTestSliceAnalysisPass(); void registerTestTensorTransforms(); void registerTestTilingInterface(); +void registerTestTopologicalSortAnalysisPass(); void registerTestTransformDialectInterpreterPass(); void registerTestVectorLowerings(); void registerTestNvgpuLowerings(); @@ -207,6 +208,7 @@ mlir::test::registerTestSliceAnalysisPass(); mlir::test::registerTestTensorTransforms(); mlir::test::registerTestTilingInterface(); + mlir::test::registerTestTopologicalSortAnalysisPass(); mlir::test::registerTestTransformDialectInterpreterPass(); mlir::test::registerTestVectorLowerings(); mlir::test::registerTestNvgpuLowerings(); diff --git a/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp b/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp --- a/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp +++ b/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp @@ -518,7 +518,8 @@ os << tabs << formatv("if (auto attr = {0}->getAttr(\"{1}\")) {{\n", opVar, attrName); if (attr.getAttrDefName() == "SPV_ScopeAttr" || - attr.getAttrDefName() == "SPV_MemorySemanticsAttr") { + attr.getAttrDefName() == "SPV_MemorySemanticsAttr" || + attr.getAttrDefName() == "SPV_MatrixLayoutAttr") { // These two enums are encoded as to constant values in SPIR-V blob, // but we directly use the constant value as attribute in SPIR-V dialect. So // need to handle them separately from normal enum attributes. @@ -810,7 +811,8 @@ StringRef words, StringRef wordIndex, raw_ostream &os) { if (attr.getAttrDefName() == "SPV_ScopeAttr" || - attr.getAttrDefName() == "SPV_MemorySemanticsAttr") { + attr.getAttrDefName() == "SPV_MemorySemanticsAttr" || + attr.getAttrDefName() == "SPV_MatrixLayoutAttr") { // These two enums are encoded as to constant values in SPIR-V blob, // but we directly use the constant value as attribute in SPIR-V dialect. 
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -111,6 +111,7 @@
 void registerTestSliceAnalysisPass();
 void registerTestTensorTransforms();
 void registerTestTilingInterface();
+void registerTestTopologicalSortAnalysisPass();
 void registerTestTransformDialectInterpreterPass();
 void registerTestVectorLowerings();
 void registerTestNvgpuLowerings();
@@ -207,6 +208,7 @@
   mlir::test::registerTestSliceAnalysisPass();
   mlir::test::registerTestTensorTransforms();
   mlir::test::registerTestTilingInterface();
+  mlir::test::registerTestTopologicalSortAnalysisPass();
   mlir::test::registerTestTransformDialectInterpreterPass();
   mlir::test::registerTestVectorLowerings();
   mlir::test::registerTestNvgpuLowerings();
diff --git a/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp b/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp
--- a/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp
+++ b/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp
@@ -518,7 +518,8 @@
   os << tabs << formatv("if (auto attr = {0}->getAttr(\"{1}\")) {{\n", opVar,
                         attrName);
   if (attr.getAttrDefName() == "SPV_ScopeAttr" ||
-      attr.getAttrDefName() == "SPV_MemorySemanticsAttr") {
+      attr.getAttrDefName() == "SPV_MemorySemanticsAttr" ||
+      attr.getAttrDefName() == "SPV_MatrixLayoutAttr") {
     // These enums are encoded as constant values in the SPIR-V blob, but we
     // use the constant value directly as the attribute in the SPIR-V dialect,
     // so they need to be handled separately from normal enum attributes.
@@ -810,7 +811,8 @@
                               StringRef words, StringRef wordIndex,
                               raw_ostream &os) {
   if (attr.getAttrDefName() == "SPV_ScopeAttr" ||
-      attr.getAttrDefName() == "SPV_MemorySemanticsAttr") {
+      attr.getAttrDefName() == "SPV_MemorySemanticsAttr" ||
+      attr.getAttrDefName() == "SPV_MatrixLayoutAttr") {
     // These enums are encoded as constant values in the SPIR-V blob, but we
     // use the constant value directly as the attribute in the SPIR-V dialect,
     // so they need to be handled separately from normal enum attributes.
diff --git a/mlir/unittests/ExecutionEngine/CMakeLists.txt b/mlir/unittests/ExecutionEngine/CMakeLists.txt
--- a/mlir/unittests/ExecutionEngine/CMakeLists.txt
+++ b/mlir/unittests/ExecutionEngine/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_mlir_unittest(MLIRExecutionEngineTests
+  DynamicMemRef.cpp
   Invoke.cpp
 )
 get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
diff --git a/mlir/unittests/ExecutionEngine/DynamicMemRef.cpp b/mlir/unittests/ExecutionEngine/DynamicMemRef.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/unittests/ExecutionEngine/DynamicMemRef.cpp
@@ -0,0 +1,99 @@
+//===- DynamicMemRef.cpp ---------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/ExecutionEngine/CRunnerUtils.h"
+#include "llvm/ADT/SmallVector.h"
+
+#include "gmock/gmock.h"
+
+using namespace ::mlir;
+using namespace ::testing;
+
+TEST(DynamicMemRef, rankZero) {
+  int data = 57;
+
+  StridedMemRefType<int, 0> memRef;
+  memRef.basePtr = &data;
+  memRef.data = &data;
+  memRef.offset = 0;
+
+  DynamicMemRefType<int> dynamicMemRef(memRef);
+
+  llvm::SmallVector<int> values(dynamicMemRef.begin(), dynamicMemRef.end());
+  EXPECT_THAT(values, ElementsAre(57));
+}
+
+TEST(DynamicMemRef, rankOne) {
+  std::array<int, 3> data;
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    data[i] = i;
+  }
+
+  StridedMemRefType<int, 1> memRef;
+  memRef.basePtr = data.data();
+  memRef.data = data.data();
+  memRef.offset = 0;
+  memRef.sizes[0] = 3;
+  memRef.strides[0] = 1;
+
+  DynamicMemRefType<int> dynamicMemRef(memRef);
+
+  llvm::SmallVector<int> values(dynamicMemRef.begin(), dynamicMemRef.end());
+  EXPECT_THAT(values, ElementsAreArray(data));
+
+  for (int64_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(*dynamicMemRef[i], data[i]);
+  }
+}
+
+TEST(DynamicMemRef, rankTwo) {
+  std::array<int, 6> data;
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    data[i] = i;
+  }
+
+  StridedMemRefType<int, 2> memRef;
+  memRef.basePtr = data.data();
+  memRef.data = data.data();
+  memRef.offset = 0;
+  memRef.sizes[0] = 2;
+  memRef.sizes[1] = 3;
+  memRef.strides[0] = 3;
+  memRef.strides[1] = 1;
+
+  DynamicMemRefType<int> dynamicMemRef(memRef);
+
+  llvm::SmallVector<int> values(dynamicMemRef.begin(), dynamicMemRef.end());
+  EXPECT_THAT(values, ElementsAreArray(data));
+}
+
+TEST(DynamicMemRef, rankThree) {
+  std::array<int, 24> data;
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    data[i] = i;
+  }
+
+  StridedMemRefType<int, 3> memRef;
+  memRef.basePtr = data.data();
+  memRef.data = data.data();
+  memRef.offset = 0;
+  memRef.sizes[0] = 2;
+  memRef.sizes[1] = 3;
+  memRef.sizes[2] = 4;
+  memRef.strides[0] = 12;
+  memRef.strides[1] = 4;
+  memRef.strides[2] = 1;
+
+  DynamicMemRefType<int> dynamicMemRef(memRef);
+
+  llvm::SmallVector<int> values(dynamicMemRef.begin(), dynamicMemRef.end());
+  EXPECT_THAT(values, ElementsAreArray(data));
+}
\ No newline at end of file
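Note: the tests above all follow one pattern: build a statically ranked `StridedMemRefType<T, N>`, wrap it in `DynamicMemRefType<T>`, and traverse it without knowing the rank. A minimal sketch of that rank-erased traversal, using only the constructor and `begin()`/`end()` exercised by the tests (the `sumElements` helper is illustrative, not from the patch):

#include "mlir/ExecutionEngine/CRunnerUtils.h"

// Sum the elements of a memref of any static rank by erasing the rank first.
template <typename T, int N>
T sumElements(const StridedMemRefType<T, N> &memRef) {
  DynamicMemRefType<T> dynamic(memRef); // sizes/strides now held at runtime
  T total = T();
  for (T value : dynamic) // iterates elements in row-major order
    total += value;
  return total;
}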
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -580,7 +580,7 @@
 int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr,
                             ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
                             AsyncInfoTy &AsyncInfo) {
-  if (!RTL->run_region || !RTL->synchronize)
+  if (!RTL->run_region_async || !RTL->synchronize)
     return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
                            TgtVarsSize);
   return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc
--- a/third-party/benchmark/src/sysinfo.cc
+++ b/third-party/benchmark/src/sysinfo.cc
@@ -12,6 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#if defined(_MSC_VER)
+// FIXME: This must be defined before any other includes to disable deprecation
+// warnings for use of codecvt from C++17. We should remove our reliance on
+// the deprecated functionality instead.
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
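Note: the macro added above only takes effect if it is visible before MSVC's STL headers are processed, which is why the patch hoists it ahead of every other include. A minimal repro of the constraint (standalone illustration; the macro and header are MSVC-specific, and nothing below is from the patch itself):

#if defined(_MSC_VER)
// Must precede any include that pulls in <codecvt>, directly or transitively;
// defining it afterwards leaves the deprecation diagnostics enabled.
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include <codecvt> // compiles warning-free on MSVC with the macro in place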