Index: cmake/modules/HandleLLVMOptions.cmake =================================================================== --- cmake/modules/HandleLLVMOptions.cmake +++ cmake/modules/HandleLLVMOptions.cmake @@ -642,6 +642,9 @@ append_common_sanitizer_flags() append("-fsanitize=address,undefined -fno-sanitize=vptr,function -fno-sanitize-recover=all" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + elseif (LLVM_USE_SANITIZER STREQUAL "Leaks") + append_common_sanitizer_flags() + append("-fsanitize=leak" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) else() message(FATAL_ERROR "Unsupported value of LLVM_USE_SANITIZER: ${LLVM_USE_SANITIZER}") endif() Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -475,6 +475,33 @@ def int_amdgcn_buffer_store_format : AMDGPUBufferStore; def int_amdgcn_buffer_store : AMDGPUBufferStore; +def int_amdgcn_tbuffer_load : Intrinsic < + [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 + [llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // voffset(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // offset(imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + []>; + +def int_amdgcn_tbuffer_store : Intrinsic < + [], + [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 + llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // voffset(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // offset(imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + []>; + class AMDGPUBufferAtomic : Intrinsic < [llvm_i32_ty], [llvm_i32_ty, // vdata(VGPR) Index: include/llvm/IR/Statepoint.h =================================================================== --- include/llvm/IR/Statepoint.h +++ include/llvm/IR/Statepoint.h @@ -62,7 +62,10 @@ bool isStatepoint(const Value &V); bool isGCRelocate(ImmutableCallSite CS); +bool isGCRelocate(const Value *V); + bool isGCResult(ImmutableCallSite CS); +bool isGCResult(const Value *V); /// Analogous to CallSiteBase, this provides most of the actual /// functionality for Statepoint and ImmutableStatepoint. It is Index: include/llvm/Option/OptParser.td =================================================================== --- include/llvm/Option/OptParser.td +++ include/llvm/Option/OptParser.td @@ -92,6 +92,7 @@ int NumArgs = 0; string HelpText = ?; string MetaVarName = ?; + string Values = ?; list Flags = []; OptionGroup Group = ?; Option Alias = ?; @@ -126,6 +127,7 @@ class Group { OptionGroup Group = group; } class HelpText { string HelpText = text; } class MetaVarName { string MetaVarName = name; } +class Values { string Values = value; } // Predefined options. Index: include/llvm/Option/OptTable.h =================================================================== --- include/llvm/Option/OptTable.h +++ include/llvm/Option/OptTable.h @@ -53,6 +53,7 @@ unsigned short GroupID; unsigned short AliasID; const char *AliasArgs; + const char *Values; }; private: @@ -120,6 +121,19 @@ return getInfo(id).MetaVar; } + /// Find possible value for given flags. This is used for shell + /// autocompletion. + /// + /// \param [in] Option - Key flag like "-stdlib=" when "-stdlib=l" + /// was passed to clang. + /// + /// \param [in] Arg - Value which we want to autocomplete like "l" + /// when "-stdlib=l" was passed to clang. 
+  ///
+  /// \return The vector of possible values.
+  std::vector<std::string> suggestValueCompletions(StringRef Option,
+                                                   StringRef Arg) const;
+
   /// Find flags from OptTable which starts with Cur.
   ///
   /// \param [in] Cur - String prefix that all returned flags need
Index: include/llvm/Option/Option.h
===================================================================
--- include/llvm/Option/Option.h
+++ include/llvm/Option/Option.h
@@ -57,6 +57,7 @@
     UnknownClass,
     FlagClass,
     JoinedClass,
+    ValuesClass,
     SeparateClass,
     RemainingArgsClass,
     RemainingArgsJoinedClass,
@@ -155,6 +156,7 @@
     case CommaJoinedClass:
       return RenderCommaJoinedStyle;
     case FlagClass:
+    case ValuesClass:
     case SeparateClass:
     case MultiArgClass:
     case JoinedOrSeparateClass:
Index: include/llvm/Support/Error.h
===================================================================
--- include/llvm/Support/Error.h
+++ include/llvm/Support/Error.h
@@ -1076,6 +1076,27 @@
   llvm_unreachable("Failure value returned from cantFail wrapped call");
 }
 
+/// Report a fatal error if ValOrErr is a failure value, otherwise unwraps and
+/// returns the contained reference.
+///
+/// This function can be used to wrap calls to fallible functions ONLY when it
+/// is known that the Error will always be a success value. E.g.
+///
+///   @code{.cpp}
+///   // foo only attempts the fallible operation if DoFallibleOperation is
+///   // true. If DoFallibleOperation is false then foo always returns a Bar&.
+///   Expected<Bar&> foo(bool DoFallibleOperation);
+///
+///   Bar &X = cantFail(foo(false));
+///   @endcode
+template <typename T>
+T& cantFail(Expected<T &> ValOrErr) {
+  if (ValOrErr)
+    return *ValOrErr;
+  else
+    llvm_unreachable("Failure value returned from cantFail wrapped call");
+}
+
 } // end namespace llvm
 
 #endif // LLVM_SUPPORT_ERROR_H
Index: include/llvm/Target/GenericOpcodes.td
===================================================================
--- include/llvm/Target/GenericOpcodes.td
+++ include/llvm/Target/GenericOpcodes.td
@@ -386,6 +386,15 @@
   let isCommutable = 1;
 }
 
+// Generic fused multiply-add instruction.
+// Behaves like the llvm.fma intrinsic, i.e. src1 * src2 + src3.
+def G_FMA : Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3);
+  let hasSideEffects = 0;
+  let isCommutable = 0;
+}
+
 // Generic FP division.
 def G_FDIV : Instruction {
   let OutOperandList = (outs type0:$dst);
Index: include/llvm/Target/TargetOpcodes.def
===================================================================
--- include/llvm/Target/TargetOpcodes.def
+++ include/llvm/Target/TargetOpcodes.def
@@ -359,6 +359,9 @@
 /// Generic FP multiplication.
 HANDLE_TARGET_OPCODE(G_FMUL)
 
+/// Generic fused multiply-add. Behaves like the llvm.fma intrinsic.
+HANDLE_TARGET_OPCODE(G_FMA)
+
 /// Generic FP division.
HANDLE_TARGET_OPCODE(G_FDIV) Index: lib/Analysis/ScalarEvolution.cpp =================================================================== --- lib/Analysis/ScalarEvolution.cpp +++ lib/Analysis/ScalarEvolution.cpp @@ -126,7 +126,7 @@ static cl::opt MulOpsInlineThreshold( "scev-mulops-inline-threshold", cl::Hidden, cl::desc("Threshold for inlining multiplication operands into a SCEV"), - cl::init(1000)); + cl::init(32)); static cl::opt AddOpsInlineThreshold( "scev-addops-inline-threshold", cl::Hidden, Index: lib/CodeGen/AsmPrinter/AsmPrinter.cpp =================================================================== --- lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2801,26 +2801,24 @@ } // Before we switch over, we force a reference to a label inside the - // xray_instr_map and xray_fn_idx sections. Since this function is always - // called just before the function's end, we assume that this is happening - // after the last return instruction. We also use the synthetic label in the - // xray_inster_map as a delimeter for the range of sleds for this function in - // the index. + // xray_fn_idx sections. This makes sure that the xray_fn_idx section is kept + // live by the linker if the function is not garbage-collected. Since this + // function is always called just before the function's end, we assume that + // this is happening after the last return instruction. auto WordSizeBytes = MAI->getCodePointerSize(); - MCSymbol *SledsStart = OutContext.createTempSymbol("xray_synthetic_", true); MCSymbol *IdxRef = OutContext.createTempSymbol("xray_fn_idx_synth_", true); OutStreamer->EmitCodeAlignment(16); - OutStreamer->EmitSymbolValue(SledsStart, WordSizeBytes, false); OutStreamer->EmitSymbolValue(IdxRef, WordSizeBytes, false); // Now we switch to the instrumentation map section. Because this is done // per-function, we are able to create an index entry that will represent the // range of sleds associated with a function. + MCSymbol *SledsStart = OutContext.createTempSymbol("xray_sleds_start", true); OutStreamer->SwitchSection(InstMap); OutStreamer->EmitLabel(SledsStart); for (const auto &Sled : Sleds) Sled.emit(WordSizeBytes, OutStreamer.get(), CurrentFnSym); - MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_synthetic_end", true); + MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_sleds_end", true); OutStreamer->EmitLabel(SledsEnd); // We then emit a single entry in the index per function. We use the symbols Index: lib/CodeGen/AsmPrinter/DwarfDebug.cpp =================================================================== --- lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1000,12 +1000,14 @@ if (Pred->getFlag(MachineInstr::FrameSetup)) break; auto PredDL = Pred->getDebugLoc(); - if (!PredDL || Pred->isDebugValue()) + if (!PredDL || Pred->isMetaInstruction()) continue; // Check whether the instruction preceding the DBG_VALUE is in the same // (sub)scope as the DBG_VALUE. 
- if (DL->getScope() == PredDL->getScope() || - LScope->dominates(LScopes.findLexicalScope(PredDL))) + if (DL->getScope() == PredDL->getScope()) + return false; + auto *PredScope = LScopes.findLexicalScope(PredDL); + if (!PredScope || LScope->dominates(PredScope)) return false; } Index: lib/CodeGen/GlobalISel/IRTranslator.cpp =================================================================== --- lib/CodeGen/GlobalISel/IRTranslator.cpp +++ lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -686,6 +686,13 @@ .addUse(getOrCreateVReg(*CI.getArgOperand(0))) .addUse(getOrCreateVReg(*CI.getArgOperand(1))); return true; + case Intrinsic::fma: + MIRBuilder.buildInstr(TargetOpcode::G_FMA) + .addDef(getOrCreateVReg(CI)) + .addUse(getOrCreateVReg(*CI.getArgOperand(0))) + .addUse(getOrCreateVReg(*CI.getArgOperand(1))) + .addUse(getOrCreateVReg(*CI.getArgOperand(2))); + return true; case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: Index: lib/CodeGen/ImplicitNullChecks.cpp =================================================================== --- lib/CodeGen/ImplicitNullChecks.cpp +++ lib/CodeGen/ImplicitNullChecks.cpp @@ -359,30 +359,15 @@ Offset < PageSize)) return SR_Unsuitable; - // Finally, we need to make sure that the access instruction actually is - // accessing from PointerReg, and there isn't some re-definition of PointerReg - // between the compare and the memory access. - // If PointerReg has been redefined before then there is no sense to continue - // lookup due to this condition will fail for any further instruction. - SuitabilityResult Suitable = SR_Suitable; - for (auto *PrevMI : PrevInsts) - for (auto &PrevMO : PrevMI->operands()) { - if (PrevMO.isReg() && PrevMO.getReg() && PrevMO.isDef() && - TRI->regsOverlap(PrevMO.getReg(), PointerReg)) - return SR_Impossible; - - // Check whether the current memory access aliases with previous one. - // If we already found that it aliases then no need to continue. - // But we continue base pointer check as it can result in SR_Impossible. - if (Suitable == SR_Suitable) { - AliasResult AR = areMemoryOpsAliased(MI, PrevMI); - if (AR == AR_WillAliasEverything) - return SR_Impossible; - if (AR == AR_MayAlias) - Suitable = SR_Unsuitable; - } - } - return Suitable; + // Finally, check whether the current memory access aliases with previous one. + for (auto *PrevMI : PrevInsts) { + AliasResult AR = areMemoryOpsAliased(MI, PrevMI); + if (AR == AR_WillAliasEverything) + return SR_Impossible; + if (AR == AR_MayAlias) + return SR_Unsuitable; + } + return SR_Suitable; } bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI, @@ -569,6 +554,12 @@ return true; } + // If MI re-defines the PointerReg then we cannot move further. + if (any_of(MI.operands(), [&](MachineOperand &MO) { + return MO.isReg() && MO.getReg() && MO.isDef() && + TRI->regsOverlap(MO.getReg(), PointerReg); + })) + return false; InstsSeenSoFar.push_back(&MI); } Index: lib/CodeGen/RegisterScavenging.cpp =================================================================== --- lib/CodeGen/RegisterScavenging.cpp +++ lib/CodeGen/RegisterScavenging.cpp @@ -372,60 +372,62 @@ /// clobbered for the longest time. /// Returns the register and the earliest position we know it to be free or /// the position MBB.end() if no register is available. 
-static std::pair<unsigned, MachineBasicBlock::iterator>
-findSurvivorBackwards(const TargetRegisterInfo &TRI,
+static std::pair<MCPhysReg, MachineBasicBlock::iterator>
+findSurvivorBackwards(const MachineRegisterInfo &MRI,
     MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
-    BitVector &Available, BitVector &Candidates) {
+    const LiveRegUnits &LiveOut, ArrayRef<MCPhysReg> AllocationOrder) {
   bool FoundTo = false;
-  unsigned Survivor = 0;
+  MCPhysReg Survivor = 0;
   MachineBasicBlock::iterator Pos;
   MachineBasicBlock &MBB = *From->getParent();
   unsigned InstrLimit = 25;
   unsigned InstrCountDown = InstrLimit;
+  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+  LiveRegUnits Used(TRI);
+
   for (MachineBasicBlock::iterator I = From;; --I) {
     const MachineInstr &MI = *I;
 
-    // Remove any candidates touched by instruction.
-    bool FoundVReg = false;
-    for (const MachineOperand &MO : MI.operands()) {
-      if (MO.isRegMask()) {
-        Candidates.clearBitsNotInMask(MO.getRegMask());
-        continue;
-      }
-      if (!MO.isReg() || MO.isUndef() || MO.isDebug())
-        continue;
-      unsigned Reg = MO.getReg();
-      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-        FoundVReg = true;
-      } else if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
-        for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
-          Candidates.reset(*AI);
-      }
-    }
+    Used.accumulateBackward(MI);
 
     if (I == To) {
-      // If one of the available registers survived this long take it.
-      Available &= Candidates;
-      int Reg = Available.find_first();
-      if (Reg != -1)
-        return std::make_pair(Reg, MBB.end());
+      // See if one of the registers in the allocation order has not been used
+      // so far.
+      for (MCPhysReg Reg : AllocationOrder) {
+        if (!MRI.isReserved(Reg) && Used.available(Reg) &&
+            LiveOut.available(Reg))
+          return std::make_pair(Reg, MBB.end());
+      }
       // Otherwise we will continue up to InstrLimit instructions to find
       // the register which is not defined/used for the longest time.
       FoundTo = true;
       Pos = To;
     }
     if (FoundTo) {
-      if (Survivor == 0 || !Candidates.test(Survivor)) {
-        int Reg = Candidates.find_first();
-        if (Reg == -1)
+      if (Survivor == 0 || !Used.available(Survivor)) {
+        MCPhysReg AvailableReg = 0;
+        for (MCPhysReg Reg : AllocationOrder) {
+          if (!MRI.isReserved(Reg) && Used.available(Reg)) {
+            AvailableReg = Reg;
+            break;
+          }
+        }
+        if (AvailableReg == 0)
           break;
-        Survivor = Reg;
+        Survivor = AvailableReg;
       }
       if (--InstrCountDown == 0)
         break;
+
+      // Keep searching when we find a vreg since the spilled register will
+      // be useful for this other vreg as well later.
+      bool FoundVReg = false;
+      for (const MachineOperand &MO : MI.operands()) {
+        if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+          FoundVReg = true;
+          break;
+        }
+      }
       if (FoundVReg) {
-        // Keep searching when we find a vreg since the spilled register will
-        // be usefull for this other vreg as well later.
         InstrCountDown = InstrLimit;
         Pos = I;
       }
@@ -568,18 +570,13 @@
                                         bool RestoreAfter, int SPAdj) {
   const MachineBasicBlock &MBB = *To->getParent();
   const MachineFunction &MF = *MBB.getParent();
-  // Consider all allocatable registers in the register class initially
-  BitVector Candidates = TRI->getAllocatableSet(MF, &RC);
-
-  // Try to find a register that's unused if there is one, as then we won't
-  // have to spill.
-  BitVector Available = getRegsAvailable(&RC);
 
   // Find the register whose use is furthest away.
MachineBasicBlock::iterator UseMI; - std::pair P = - findSurvivorBackwards(*TRI, MBBI, To, Available, Candidates); - unsigned Reg = P.first; + ArrayRef AllocationOrder = RC.getRawAllocationOrder(MF); + std::pair P = + findSurvivorBackwards(*MRI, MBBI, To, LiveUnits, AllocationOrder); + MCPhysReg Reg = P.first; MachineBasicBlock::iterator SpillBefore = P.second; assert(Reg != 0 && "No register left to scavenge!"); // Found an available register? Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14055,6 +14055,11 @@ // when we start sorting the vectors by type. return SDValue(); } + } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() && + InVT1.getSizeInBits() == VT.getSizeInBits()) { + SmallVector ConcatOps(2, DAG.getUNDEF(InVT2)); + ConcatOps[0] = VecIn2; + VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); } else { // TODO: Support cases where the length mismatch isn't exactly by a // factor of 2. Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3530,17 +3530,24 @@ LC = RTLIB::MUL_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!"); - // The high part is obtained by SRA'ing all but one of the bits of low - // part. - unsigned LoSize = VT.getSizeInBits(); - SDValue HiLHS = - DAG.getNode(ISD::SRA, dl, VT, LHS, - DAG.getConstant(LoSize - 1, dl, - TLI.getPointerTy(DAG.getDataLayout()))); - SDValue HiRHS = - DAG.getNode(ISD::SRA, dl, VT, RHS, - DAG.getConstant(LoSize - 1, dl, - TLI.getPointerTy(DAG.getDataLayout()))); + SDValue HiLHS; + SDValue HiRHS; + if (isSigned) { + // The high part is obtained by SRA'ing all but one of the bits of low + // part. + unsigned LoSize = VT.getSizeInBits(); + HiLHS = + DAG.getNode(ISD::SRA, dl, VT, LHS, + DAG.getConstant(LoSize - 1, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + HiRHS = + DAG.getNode(ISD::SRA, dl, VT, RHS, + DAG.getConstant(LoSize - 1, dl, + TLI.getPointerTy(DAG.getDataLayout()))); + } else { + HiLHS = DAG.getConstant(0, dl, VT); + HiRHS = DAG.getConstant(0, dl, VT); + } // Here we're passing the 2 arguments explicitly as 4 arguments that are // pre-lowered to the correct types. 
This all depends upon WideVT not Index: lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp =================================================================== --- lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp +++ lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp @@ -51,6 +51,7 @@ uint32_t ModIndex, msf::MSFBuilder &Msf) : MSF(Msf), ModuleName(ModuleName) { + ::memset(&Layout, 0, sizeof(Layout)); Layout.Mod = ModIndex; } @@ -102,6 +103,7 @@ template Foo makeFoo(T &&t) { return Foo(std::move(t)); } void DbiModuleDescriptorBuilder::finalize() { + Layout.SC.ModuleIndex = Layout.Mod; Layout.FileNameOffs = 0; // TODO: Fix this Layout.Flags = 0; // TODO: Fix this Layout.C11Bytes = 0; Index: lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp =================================================================== --- lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp +++ lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp @@ -237,6 +237,7 @@ return EC; DbiStreamHeader *H = Allocator.Allocate(); + ::memset(H, 0, sizeof(DbiStreamHeader)); H->VersionHeader = *VerHeader; H->VersionSignature = -1; H->Age = Age; Index: lib/IR/Statepoint.cpp =================================================================== --- lib/IR/Statepoint.cpp +++ lib/IR/Statepoint.cpp @@ -44,10 +44,22 @@ return CS.getInstruction() && isa(CS.getInstruction()); } +bool llvm::isGCRelocate(const Value *V) { + if (auto CS = ImmutableCallSite(V)) + return isGCRelocate(CS); + return false; +} + bool llvm::isGCResult(ImmutableCallSite CS) { return CS.getInstruction() && isa(CS.getInstruction()); } +bool llvm::isGCResult(const Value *V) { + if (auto CS = ImmutableCallSite(V)) + return isGCResult(CS); + return false; +} + bool llvm::isStatepointDirectiveAttr(Attribute Attr) { return Attr.hasAttribute("statepoint-id") || Attr.hasAttribute("statepoint-num-patch-bytes"); Index: lib/ObjectYAML/CodeViewYAMLSymbols.cpp =================================================================== --- lib/ObjectYAML/CodeViewYAMLSymbols.cpp +++ lib/ObjectYAML/CodeViewYAMLSymbols.cpp @@ -286,16 +286,15 @@ } template <> void SymbolRecordImpl::map(IO &IO) { - // TODO: Print the linkage name - - IO.mapRequired("PtrParent", Symbol.Parent); - IO.mapRequired("PtrEnd", Symbol.End); - IO.mapRequired("PtrNext", Symbol.Next); + IO.mapOptional("PtrParent", Symbol.Parent, 0U); + IO.mapOptional("PtrEnd", Symbol.End, 0U); + IO.mapOptional("PtrNext", Symbol.Next, 0U); IO.mapRequired("CodeSize", Symbol.CodeSize); IO.mapRequired("DbgStart", Symbol.DbgStart); IO.mapRequired("DbgEnd", Symbol.DbgEnd); IO.mapRequired("FunctionType", Symbol.FunctionType); - IO.mapRequired("Segment", Symbol.Segment); + IO.mapOptional("Offset", Symbol.CodeOffset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("Flags", Symbol.Flags); IO.mapRequired("DisplayName", Symbol.Name); } @@ -308,8 +307,8 @@ template <> void SymbolRecordImpl::map(IO &IO) { IO.mapRequired("Flags", Symbol.Flags); - IO.mapRequired("Seg", Symbol.Segment); - IO.mapRequired("Off", Symbol.Offset); + IO.mapOptional("Offset", Symbol.Offset, 0U); + IO.mapOptional("Segment", Symbol.Segment, uint16_t(0)); IO.mapRequired("Name", Symbol.Name); } @@ -325,8 +324,8 @@ } template <> void SymbolRecordImpl::map(IO &IO) { - IO.mapRequired("PtrParent", Symbol.Parent); - IO.mapRequired("PtrEnd", Symbol.End); + IO.mapOptional("PtrParent", Symbol.Parent, 0U); + IO.mapOptional("PtrEnd", Symbol.End, 0U); IO.mapRequired("Inlinee", Symbol.Inlinee); // TODO: The binary annotations } @@ -368,17 +367,17 @@ } template <> void 
SymbolRecordImpl::map(IO &IO) {
-  // TODO: Print the linkage name
-  IO.mapRequired("PtrParent", Symbol.Parent);
-  IO.mapRequired("PtrEnd", Symbol.End);
+  IO.mapOptional("PtrParent", Symbol.Parent, 0U);
+  IO.mapOptional("PtrEnd", Symbol.End, 0U);
   IO.mapRequired("CodeSize", Symbol.CodeSize);
-  IO.mapRequired("Segment", Symbol.Segment);
+  IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("BlockName", Symbol.Name);
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Print the linkage name
-  IO.mapRequired("Segment", Symbol.Segment);
+  IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("Flags", Symbol.Flags);
   IO.mapRequired("Flags", Symbol.Flags);
   IO.mapRequired("DisplayName", Symbol.Name);
@@ -428,8 +427,8 @@
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Map Linkage Name
-  IO.mapRequired("Segment", Symbol.Segment);
+  IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("Type", Symbol.Type);
 }
 
@@ -441,14 +440,13 @@
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Map Linkage Name
-  IO.mapRequired("Segment", Symbol.Segment);
+  IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("CallInstructionSize", Symbol.CallInstructionSize);
   IO.mapRequired("Type", Symbol.Type);
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Map Linkage Name
   IO.mapRequired("Register", Symbol.Register);
   IO.mapRequired("CookieKind", Symbol.CookieKind);
   IO.mapRequired("Flags", Symbol.Flags);
@@ -487,14 +485,16 @@
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Map linkage name
   IO.mapRequired("Type", Symbol.Type);
+  IO.mapOptional("Offset", Symbol.DataOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("DisplayName", Symbol.Name);
 }
 
 template <> void SymbolRecordImpl::map(IO &IO) {
-  // TODO: Map linkage name
   IO.mapRequired("Type", Symbol.Type);
+  IO.mapOptional("Offset", Symbol.DataOffset, 0U);
+  IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
   IO.mapRequired("DisplayName", Symbol.Name);
 }
 }
Index: lib/Option/OptTable.cpp
===================================================================
--- lib/Option/OptTable.cpp
+++ lib/Option/OptTable.cpp
@@ -194,6 +194,37 @@
   return 0;
 }
 
+// Returns true if one of the Prefixes + In.Names matches Option
+static bool optionMatches(const OptTable::Info &In, StringRef Option) {
+  if (In.Values && In.Prefixes)
+    for (size_t I = 0; In.Prefixes[I]; I++)
+      if (Option == std::string(In.Prefixes[I]) + In.Name)
+        return true;
+  return false;
+}
+
+// This function is for flag value completion.
+// E.g. when "-stdlib=" and "l" are passed to this function, it returns the
+// appropriate values for stdlib that start with "l".
+std::vector<std::string>
+OptTable::suggestValueCompletions(StringRef Option, StringRef Arg) const {
+  // Search all options and return possible values.
+ for (const Info &In : OptionInfos.slice(FirstSearchableIndex)) { + if (!optionMatches(In, Option)) + continue; + + SmallVector Candidates; + StringRef(In.Values).split(Candidates, ",", -1, false); + + std::vector Result; + for (StringRef Val : Candidates) + if (Val.startswith(Arg)) + Result.push_back(Val); + return Result; + } + return {}; +} + std::vector OptTable::findByPrefix(StringRef Cur) const { std::vector Ret; for (const Info &In : OptionInfos.slice(FirstSearchableIndex)) { @@ -336,6 +367,9 @@ case Option::FlagClass: break; + case Option::ValuesClass: + break; + case Option::SeparateClass: case Option::JoinedOrSeparateClass: case Option::RemainingArgsClass: case Option::RemainingArgsJoinedClass: Name += ' '; Index: lib/Option/Option.cpp =================================================================== --- lib/Option/Option.cpp +++ lib/Option/Option.cpp @@ -47,6 +47,7 @@ P(UnknownClass); P(FlagClass); P(JoinedClass); + P(ValuesClass); P(SeparateClass); P(CommaJoinedClass); P(MultiArgClass); Index: lib/Support/Unix/Program.inc =================================================================== --- lib/Support/Unix/Program.inc +++ lib/Support/Unix/Program.inc @@ -449,11 +449,22 @@ size_t ArgLength = Program.size() + 1; for (ArrayRef::iterator I = Args.begin(), E = Args.end(); I != E; ++I) { - ArgLength += strlen(*I) + 1; + size_t length = strlen(*I); + + // Ensure that we do not exceed the MAX_ARG_STRLEN constant on Linux, which + // does not have a constant unlike what the man pages would have you + // believe. Since this limit is pretty high, perform the check + // unconditionally rather than trying to be aggressive and limiting it to + // Linux only. + if (length >= (32 * 4096)) + return false; + + ArgLength += length + 1; if (ArgLength > size_t(HalfArgMax)) { return false; } } + return true; } } Index: lib/Support/raw_ostream.cpp =================================================================== --- lib/Support/raw_ostream.cpp +++ lib/Support/raw_ostream.cpp @@ -548,7 +548,11 @@ pos += Size; #ifndef LLVM_ON_WIN32 +#if defined(__linux__) + bool ShouldWriteInChunks = true; +#else bool ShouldWriteInChunks = false; +#endif #else // Writing a large size of output to Windows console returns ENOMEM. It seems // that, prior to Windows 8, WriteFile() is redirecting to WriteConsole(), and Index: lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp =================================================================== --- lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -43,26 +43,25 @@ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { - // This table *must* be in the order that the fixup_* kinds are defined in - // AArch64FixupKinds.h. 
- // - // Name Offset (bits) Size (bits) Flags - { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_add_imm12", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 }, - { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_movw", 5, 16, 0 }, - { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_tlsdesc_call", 0, 0, 0 } - }; + // This table *must* be in the order that the fixup_* kinds are defined + // in AArch64FixupKinds.h. + // + // Name Offset (bits) Size (bits) Flags + {"fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_add_imm12", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale1", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale2", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale4", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale8", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale16", 10, 12, 0}, + {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_movw", 5, 16, 0}, + {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_tlsdesc_call", 0, 0, 0}}; if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -644,7 +644,11 @@ "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<"FeatureCIInsts">; -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, + AssemblerPredicate<"FeatureFlatAddressSpace">; + +def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, + AssemblerPredicate<"FeatureFlatGlobalInsts">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<"Feature16BitInsts">; Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -403,6 +403,8 @@ STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, + TBUFFER_STORE_FORMAT_X3, + TBUFFER_LOAD_FORMAT, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3664,6 +3664,8 @@ NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) + NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- 
lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -75,8 +75,10 @@ return TTI::PSK_FastHardware; } - unsigned getNumberOfRegisters(bool Vector); - unsigned getRegisterBitWidth(bool Vector) const; + unsigned getHardwareNumberOfRegisters(bool Vector) const; + unsigned getNumberOfRegisters(bool Vector) const; + unsigned getRegisterBitWidth(bool Vector) const ; + unsigned getMinVectorRegisterBitWidth() const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -184,9 +184,9 @@ } } -unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { - if (Vec) - return 0; +unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { + // The concept of vector registers doesn't really exist. Some packed vector + // operations operate on the normal 32-bit registers. // Number of VGPRs on SI. if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) @@ -195,8 +195,18 @@ return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const { + // This is really the number of registers to fill when vectorizing / + // interleaving loops, so we lie to avoid trying to use all registers. + return getHardwareNumberOfRegisters(Vec) >> 3; +} + unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const { - return Vector ? 0 : 32; + return 32; +} + +unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const { + return 32; } unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { @@ -247,11 +257,11 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Disable unrolling if the loop is not vectorized. + // TODO: Enable this again. if (VF == 1) return 1; - // Semi-arbitrary large amount. 
- return 64; + return 8; } int AMDGPUTTIImpl::getArithmeticInstrCost( Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -152,6 +152,8 @@ ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, + ImmTyDFMT, + ImmTyNFMT, ImmTyHwreg, ImmTyOff, ImmTySendMsg, @@ -292,6 +294,8 @@ bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } + bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); } + bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); } bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } @@ -636,6 +640,8 @@ case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; + case ImmTyDFMT: OS << "DFMT"; break; + case ImmTyNFMT: OS << "NFMT"; break; case ImmTyClampSI: OS << "ClampSI"; break; case ImmTyOModSI: OS << "OModSI"; break; case ImmTyDppCtrl: OS << "DppCtrl"; break; @@ -1029,6 +1035,8 @@ void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } + void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); + AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultSLC() const; AMDGPUOperand::Ptr defaultTFE() const; @@ -1042,6 +1050,7 @@ AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; AMDGPUOperand::Ptr defaultOffsetU12() const; + AMDGPUOperand::Ptr defaultOffsetS13() const; OperandMatchResultTy parseOModOperand(OperandVector &Operands); @@ -2554,11 +2563,21 @@ return MatchOperand_ParseFail; Parser.Lex(); + + bool IsMinus = false; + if (getLexer().getKind() == AsmToken::Minus) { + Parser.Lex(); + IsMinus = true; + } + if (getLexer().isNot(AsmToken::Integer)) return MatchOperand_ParseFail; if (getParser().parseAbsoluteExpression(Int)) return MatchOperand_ParseFail; + + if (IsMinus) + Int = -Int; break; } } @@ -3743,6 +3762,44 @@ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } +void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. 
+ if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyOffset); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); +} + //===----------------------------------------------------------------------===// // mimg //===----------------------------------------------------------------------===// @@ -3870,6 +3927,10 @@ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + //===----------------------------------------------------------------------===// // vop3 //===----------------------------------------------------------------------===// @@ -3919,6 +3980,8 @@ {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, + {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, + {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -57,6 +57,11 @@ string OpName = NAME # suffix; } +class MTBUFAddr64Table { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + //===----------------------------------------------------------------------===// // MTBUF classes //===----------------------------------------------------------------------===// @@ -78,14 +83,31 @@ let EXP_CNT = 1; let MTBUF = 1; let Uses = [EXEC]; - let hasSideEffects = 0; let SchedRW = [WriteVMEM]; + + let AsmMatchConverter = "cvtMtbuf"; + + bits<1> offen = 0; + bits<1> idxen = 0; + bits<1> addr64 = 0; + bits<1> has_vdata = 1; + bits<1> has_vaddr = 1; + bits<1> has_glc = 1; + bits<1> glc_value = 0; // the value for glc if no such operand + bits<4> dfmt_value = 1; // the value for dfmt if no such operand + bits<3> nfmt_value = 0; // the value for nfmt if no such operand + bits<1> has_srsrc = 1; + bits<1> has_soffset = 1; + bits<1> has_offset = 1; + bits<1> has_slc = 1; + bits<1> has_tfe = 1; + bits<1> has_dfmt = 1; + bits<1> has_nfmt = 1; } class MTBUF_Real : - InstSI , - Enc64 { + InstSI { let isPseudo = 0; let isCodeGenOnly = 0; @@ -97,57 +119,168 @@ let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; - bits<8> vdata; bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> addr64; - bits<4> dfmt; - bits<3> nfmt; - bits<8> vaddr; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} 
= tfe; - let Inst{63-56} = soffset; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class getMTBUFInsDA vdataList, + list vaddrList=[]> { + RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); + RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + dag InsNoData = !if(!empty(vaddrList), + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe), + (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe) + ); + dag InsData = !if(!empty(vaddrList), + (ins vdataClass:$vdata, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe), + (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + slc:$slc, tfe:$tfe) + ); + dag ret = !if(!empty(vdataList), InsNoData, InsData); } -class MTBUF_Load_Pseudo : MTBUF_Pseudo < - opName, (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { +class getMTBUFIns vdataList=[]> { + dag ret = + !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA.ret, + (ins)))))); +} + +class getMTBUFAsmOps { + string Pfx = + !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset", + !if(!eq(addrKind, BUFAddrKind.OffEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen", + !if(!eq(addrKind, BUFAddrKind.IdxEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen", + !if(!eq(addrKind, BUFAddrKind.BothEn), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen", + !if(!eq(addrKind, BUFAddrKind.Addr64), + "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64", + ""))))); + string ret = Pfx # "$offset"; +} + +class MTBUF_SetupAddr { + bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0); + + bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1); +} + +class MTBUF_Load_Pseudo pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind> + : MTBUF_Pseudo.ret, + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr { + let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 1; let mayStore = 0; } -class MTBUF_Store_Pseudo : MTBUF_Pseudo < - opName, (outs), - (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { +multiclass MTBUF_Pseudo_Loads { + + def _OFFSET : MTBUF_Load_Pseudo , + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Load_Pseudo , + 
MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Load_Pseudo ; + def _IDXEN : MTBUF_Load_Pseudo ; + def _BOTHEN : MTBUF_Load_Pseudo ; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Load_Pseudo ; + def _OFFEN_exact : MTBUF_Load_Pseudo ; + def _IDXEN_exact : MTBUF_Load_Pseudo ; + def _BOTHEN_exact : MTBUF_Load_Pseudo ; + } +} + +class MTBUF_Store_Pseudo pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind, + RegisterClass vdataClassCopy = vdataClass> + : MTBUF_Pseudo.ret, + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr { + let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 0; let mayStore = 1; } +multiclass MTBUF_Pseudo_Stores { + + def _OFFSET : MTBUF_Store_Pseudo , + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Store_Pseudo , + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Store_Pseudo ; + def _IDXEN : MTBUF_Store_Pseudo ; + def _BOTHEN : MTBUF_Store_Pseudo ; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Store_Pseudo ; + def _OFFEN_exact : MTBUF_Store_Pseudo ; + def _IDXEN_exact : MTBUF_Store_Pseudo ; + def _BOTHEN_exact : MTBUF_Store_Pseudo ; + } +} + + //===----------------------------------------------------------------------===// // MUBUF classes //===----------------------------------------------------------------------===// @@ -676,14 +809,14 @@ // MTBUF Instructions //===----------------------------------------------------------------------===// -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>; -def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>; -def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>; -def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>; -def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>; -def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; } // End let SubtargetPredicate = isGCN @@ -1093,22 +1226,98 @@ // MTBUF Patterns //===----------------------------------------------------------------------===// -// TBUFFER_STORE_FORMAT_*, addr64=0 -class MTBUF_StoreResource : Pat< - (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, - i32:$soffset, imm:$inst_offset, imm:$dfmt, - imm:$nfmt, imm:$offen, imm:$idxen, - imm:$glc, imm:$slc, imm:$tfe), - (opcode - $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), - (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, - (as_i1imm $slc), (as_i1imm $tfe), $soffset) ->; +//===----------------------------------------------------------------------===// +// 
tbuffer_load/store_format patterns +//===----------------------------------------------------------------------===// + +multiclass MTBUF_LoadIntrinsicPat { + def : Pat< + (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; + +multiclass MTBUF_StoreIntrinsicPat { + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), + (as_i8imm $nfmt), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, + imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast(opcode # _BOTHEN_exact) + $vdata, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; } // End let Predicates = [isGCN] @@ -1224,21 +1433,44 @@ class MTBUF_Real_si op, MTBUF_Pseudo ps> : MTBUF_Real, + Enc64, SIMCInstr { let AssemblerPredicate=isSICI; let DecoderNamespace="SICI"; - bits<1> addr64; - let Inst{15} = addr64; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; 
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{15} = ps.addr64; let Inst{18-16} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_si op> { + def _OFFSET_si : MTBUF_Real_si (NAME#"_OFFSET")>; + def _ADDR64_si : MTBUF_Real_si (NAME#"_ADDR64")>; + def _OFFEN_si : MTBUF_Real_si (NAME#"_OFFEN")>; + def _IDXEN_si : MTBUF_Real_si (NAME#"_IDXEN")>; + def _BOTHEN_si : MTBUF_Real_si (NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>; //===----------------------------------------------------------------------===// // CI @@ -1350,16 +1582,39 @@ class MTBUF_Real_vi op, MTBUF_Pseudo ps> : MTBUF_Real, + Enc64, SIMCInstr { let AssemblerPredicate=isVI; let DecoderNamespace="VI"; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{18-15} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_vi op> { + def _OFFSET_vi : MTBUF_Real_vi (NAME#"_OFFSET")>; + def _OFFEN_vi : MTBUF_Real_vi (NAME#"_OFFEN")>; + def _IDXEN_vi : MTBUF_Real_vi (NAME#"_IDXEN")>; + def _BOTHEN_vi : MTBUF_Real_vi (NAME#"_BOTHEN")>; +} +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>; +defm TBUFFER_STORE_FORMAT_XY : 
MTBUF_Real_AllAddr_vi <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>; Index: lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- lib/Target/AMDGPU/FLATInstructions.td +++ lib/Target/AMDGPU/FLATInstructions.td @@ -31,8 +31,6 @@ let VM_CNT = 1; let LGKM_CNT = 1; - let Uses = [EXEC, FLAT_SCR]; // M0 - let UseNamedOperandTable = 1; let hasSideEffects = 0; let SchedRW = [WriteVMEM]; @@ -40,10 +38,16 @@ string Mnemonic = opName; string AsmOperands = asmOps; + bits<1> is_flat_global = 0; + bits<1> is_flat_scratch = 0; + bits<1> has_vdst = 1; bits<1> has_data = 1; bits<1> has_glc = 1; bits<1> glcValue = 0; + + // TODO: M0 if it could possibly access LDS (before gfx9? only)? + let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]); } class FLAT_Real op, FLAT_Pseudo ps> : @@ -68,7 +72,10 @@ // Only valid on gfx9 bits<1> lds = 0; // XXX - What does this actually do? - bits<2> seg; // Segment, 00=flat, 01=scratch, 10=global, 11=reserved + + // Segment, 00=flat, 01=scratch, 10=global, 11=reserved + bits<2> seg = !if(ps.is_flat_global, 0b10, + !if(ps.is_flat_scratch, 0b01, 0)); // Signed offset. Highest bit ignored for flat and treated as 12-bit // unsigned for flat acceses. @@ -81,7 +88,7 @@ // Only valid on GFX9+ let Inst{12-0} = offset; let Inst{13} = lds; - let Inst{15-14} = 0; + let Inst{15-14} = seg; let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); let Inst{17} = slc; @@ -106,6 +113,16 @@ let mayLoad = 1; } +class FLAT_Global_Load_Pseudo : + FLAT_Load_Pseudo { + let is_flat_global = 1; +} + +class FLAT_Scratch_Load_Pseudo : + FLAT_Load_Pseudo { + let is_flat_scratch = 1; +} + class FLAT_Store_Pseudo : FLAT_Pseudo< opName, @@ -119,6 +136,16 @@ let has_vdst = 0; } +class FLAT_Global_Store_Pseudo : + FLAT_Store_Pseudo { + let is_flat_global = 1; +} + +class FLAT_Scratch_Store_Pseudo : + FLAT_Store_Pseudo { + let is_flat_scratch = 1; +} + multiclass FLAT_Atomic_Pseudo< string opName, RegisterClass vdst_rc, @@ -306,6 +333,26 @@ } // End SubtargetPredicate = isCI +let SubtargetPredicate = HasFlatGlobalInsts in { +def GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; +def GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; +def GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; +def GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>; +def GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>; +def GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>; +def GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>; +def GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; + +def GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; +def GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; +def GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>; +def GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; +def GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; +def GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>; + +} // End SubtargetPredicate = HasFlatGlobalInsts + + //===----------------------------------------------------------------------===// // Flat Patterns 
//===----------------------------------------------------------------------===// @@ -557,3 +604,18 @@ defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>; +def GLOBAL_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, GLOBAL_LOAD_UBYTE>; +def GLOBAL_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, GLOBAL_LOAD_SBYTE>; +def GLOBAL_LOAD_USHORT_vi : FLAT_Real_vi <0x12, GLOBAL_LOAD_USHORT>; +def GLOBAL_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, GLOBAL_LOAD_SSHORT>; +def GLOBAL_LOAD_DWORD_vi : FLAT_Real_vi <0x14, GLOBAL_LOAD_DWORD>; +def GLOBAL_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, GLOBAL_LOAD_DWORDX2>; +def GLOBAL_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, GLOBAL_LOAD_DWORDX4>; +def GLOBAL_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, GLOBAL_LOAD_DWORDX3>; + +def GLOBAL_STORE_BYTE_vi : FLAT_Real_vi <0x18, GLOBAL_STORE_BYTE>; +def GLOBAL_STORE_SHORT_vi : FLAT_Real_vi <0x1a, GLOBAL_STORE_SHORT>; +def GLOBAL_STORE_DWORD_vi : FLAT_Real_vi <0x1c, GLOBAL_STORE_DWORD>; +def GLOBAL_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, GLOBAL_STORE_DWORDX2>; +def GLOBAL_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, GLOBAL_STORE_DWORDX4>; +def GLOBAL_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, GLOBAL_STORE_DWORDX3>; Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -42,6 +42,7 @@ void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, @@ -52,6 +53,9 @@ void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -84,7 +88,11 @@ const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - + void printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -72,6 +72,11 @@ O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); } +void AMDGPUInstPrinter::printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(static_cast(MI->getOperand(OpNo).getImm())); +} + void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ 
-118,6 +123,16 @@ } } +void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << ((OpNo == 0)? "offset:" : " offset:"); + printS16ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -216,6 +231,24 @@ O << " vm"; } +void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " dfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " nfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI) { switch (RegNo) { Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp =================================================================== --- lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -174,6 +174,31 @@ return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); } +static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII) { + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + auto &Src = MI.getOperand(1); + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = Src.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + !TargetRegisterInfo::isVirtualRegister(DstReg)) + return false; + + for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) { + const auto *UseMI = MO.getParent(); + if (UseMI == &MI) + continue; + if (MO.isDef() || UseMI->getParent() != MI.getParent() || + UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END || + !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src)) + return false; + } + // Change VGPR to SGPR destination. + MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg))); + return true; +} + // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. // // SGPRx = ... @@ -214,6 +239,9 @@ if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) return false; + if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII)) + return true; + // TODO: Could have multiple extracts? 
unsigned SubReg = CopyUse.getOperand(1).getSubReg(); if (SubReg != AMDGPU::NoSubRegister) @@ -563,6 +591,8 @@ break; } TII->moveToVALU(MI); + } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { + tryChangeVGPRtoSGPRinCopy(MI, TRI, TII); } break; Index: lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- lib/Target/AMDGPU/SIFoldOperands.cpp +++ lib/Target/AMDGPU/SIFoldOperands.cpp @@ -13,6 +13,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -166,6 +167,8 @@ if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && TargetRegisterInfo::isVirtualRegister(New->getReg())) { Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + + Old.setIsUndef(New->isUndef()); return true; } @@ -470,7 +473,7 @@ return &Op; MachineInstr *Def = MRI.getVRegDef(Op.getReg()); - if (Def->isMoveImmediate()) { + if (Def && Def->isMoveImmediate()) { MachineOperand &ImmSrc = Def->getOperand(1); if (ImmSrc.isImm()) return &ImmSrc; @@ -921,12 +924,9 @@ // level. bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { + for (I = MBB->begin(); I != MBB->end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -3288,6 +3288,8 @@ SelectionDAG &DAG) const { unsigned IntrID = cast(Op.getOperand(1))->getZExtValue(); SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: { @@ -3313,7 +3315,6 @@ Op.getOperand(5), // glc Op.getOperand(6) // slc }; - MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? @@ -3328,6 +3329,29 @@ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); } + case Intrinsic::amdgcn_tbuffer_load: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Op.getOperand(4), // voffset + Op.getOperand(5), // soffset + Op.getOperand(6), // offset + Op.getOperand(7), // dfmt + Op.getOperand(8), // nfmt + Op.getOperand(9), // glc + Op.getOperand(10) // slc + }; + + EVT VT = Op.getOperand(2).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad, + VT.getStoreSize(), VT.getStoreSize()); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } // Basic sample. 
case Intrinsic::amdgcn_image_sample: case Intrinsic::amdgcn_image_sample_cl: @@ -3393,11 +3417,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - + MachineFunction &MF = DAG.getMachineFunction(); + switch (IntrinsicID) { case Intrinsic::amdgcn_exp: { const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); @@ -3463,33 +3487,6 @@ return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, Op.getOperand(2), Op.getOperand(3)); } - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDValue Ops[] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); - } case AMDGPUIntrinsic::AMDGPU_kill: { SDValue Src = Op.getOperand(2); if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) { @@ -3505,7 +3502,6 @@ } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { - const MachineFunction &MF = DAG.getMachineFunction(); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) @@ -3514,6 +3510,76 @@ } return SDValue(); }; + case AMDGPUIntrinsic::SI_tbuffer_store: { + + // Extract vindex and voffset from vaddr as appropriate + const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10)); + const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11)); + SDValue VAddr = Op.getOperand(5); + + SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); + + assert(!(OffEn->isOne() && IdxEn->isOne()) && + "Legacy intrinsic doesn't support both offset and index - use new version"); + + SDValue VIndex = IdxEn->isOne() ? VAddr : Zero; + SDValue VOffset = OffEn->isOne() ? VAddr : Zero; + + // Deal with the vec-3 case + const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4)); + auto Opcode = NumChannels->getZExtValue() == 3 ? 
+ AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT; + + SDValue Ops[] = { + Chain, + Op.getOperand(3), // vdata + Op.getOperand(2), // rsrc + VIndex, + VOffset, + Op.getOperand(6), // soffset + Op.getOperand(7), // inst_offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(12), // glc + Op.getOperand(13), // slc + }; + + const ConstantSDNode *tfe = cast(Op.getOperand(14)); + assert(tfe->getZExtValue() == 0 && + "Value of tfe other than zero is unsupported"); + + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(Opcode, DL, + Op->getVTList(), Ops, VT, MMO); + } + + case Intrinsic::amdgcn_tbuffer_store: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // voffset + Op.getOperand(6), // soffset + Op.getOperand(7), // offset + Op.getOperand(8), // dfmt + Op.getOperand(9), // nfmt + Op.getOperand(10), // glc + Op.getOperand(11) // slc + }; + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + default: return Op; } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -39,25 +39,41 @@ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", - SDTypeProfile<0, 13, - [SDTCisVT<0, v4i32>, // rsrc(SGPR) - SDTCisVT<1, iAny>, // vdata(VGPR) - SDTCisVT<2, i32>, // num_channels(imm) - SDTCisVT<3, i32>, // vaddr(VGPR) +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", + SDTypeProfile<1, 9, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // inst_offset(imm) + SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // dfmt(imm) SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // offen(imm) - SDTCisVT<9, i32>, // idxen(imm) - SDTCisVT<10, i32>, // glc(imm) - SDTCisVT<11, i32>, // slc(imm) - SDTCisVT<12, i32> // tfe(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) ]>, - [SDNPMayStore, SDNPMemOperand, SDNPHasChain] + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] >; +def SDTtbuffer_store : SDTypeProfile<0, 10, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) + ]>; + +def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", + SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; + def SDTBufferLoad : SDTypeProfile<1, 5, [ // vdata SDTCisVT<1, v4i32>, // rsrc @@ -525,7 +541,7 @@ def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>; def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>; -def offset_s13 : 
NamedOperandS13<"Offset", NamedMatchClass<"OffsetS13">>; +def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>; def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>; def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>; def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>; @@ -545,6 +561,9 @@ def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; +def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>; +def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>; + def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -92,6 +92,8 @@ case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_SUBB_U32_e64: + if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm()) + return false; // Additional verification is needed for sdst/src2. return true; Index: lib/Target/PowerPC/PPCTargetMachine.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetMachine.cpp +++ lib/Target/PowerPC/PPCTargetMachine.cpp @@ -388,7 +388,7 @@ // FIXME: We probably don't need to run these for -fPIE. if (getPPCTargetMachine().isPositionIndependent()) { // FIXME: LiveVariables should not be necessary here! - // PPCTLSDYnamicCallPass uses LiveIntervals which previously dependet on + // PPCTLSDynamicCallPass uses LiveIntervals which previously dependent on // LiveVariables. This (unnecessary) dependency has been removed now, // however a stage-2 clang build fails without LiveVariables computed here. addPass(&LiveVariablesID, false); Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1662,6 +1662,12 @@ MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; + + // TODO: These control memcmp expansion in CGP and are set low to prevent + // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. + MaxLoadsPerMemcmp = 1; + MaxLoadsPerMemcmpOptSize = 1; + // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); @@ -14272,7 +14278,8 @@ // If we are inserting a element, see if we can do this more efficiently with // a blend shuffle with a rematerializable vector than a costly integer // insertion. - if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) { + if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && + 16 <= EltVT.getSizeInBits()) { SmallVector BlendMask; for (unsigned i = 0; i != NumElts; ++i) BlendMask.push_back(i == IdxVal ? 
i + NumElts : i); Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -107,7 +107,7 @@ bool isLegalMaskedScatter(Type *DataType); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -2232,6 +2232,12 @@ return (CallerBits & CalleeBits) == CalleeBits; } +bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + // TODO: We can increase these based on available vector ops. + MaxLoadSize = ST->is64Bit() ? 8 : 4; + return true; +} + bool X86TTIImpl::enableInterleavedAccessVectorization() { // TODO: We expect this to be beneficial regardless of arch, // but there are currently some unexplained performance artifacts on Atom. Index: lib/ToolDrivers/llvm-lib/LibDriver.cpp =================================================================== --- lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -31,7 +31,7 @@ enum { OPT_INVALID = 0, -#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11) OPT_##ID, +#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID, #include "Options.inc" #undef OPTION }; @@ -41,11 +41,9 @@ #undef PREFIX static const llvm::opt::OptTable::Info infoTable[] = { -#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10) \ - { \ - X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, X8, X7, \ - OPT_##GROUP, OPT_##ALIAS, X6 \ - }, +#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10, X11) \ + {X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, \ + X8, X7, OPT_##GROUP, OPT_##ALIAS, X6, X11}, #include "Options.inc" #undef OPTION }; Index: lib/Transforms/Scalar/NewGVN.cpp =================================================================== --- lib/Transforms/Scalar/NewGVN.cpp +++ lib/Transforms/Scalar/NewGVN.cpp @@ -3025,12 +3025,10 @@ // It's okay to have the same expression already in there if it is // identical in nature. // This can happen when the leader of the stored value changes over time. 
- if (!Okay) { - Okay = Okay && std::get<1>(Res.first->second) == KV.second; - Okay = Okay && - lookupOperandLeader(std::get<2>(Res.first->second)) == - lookupOperandLeader(SE->getStoredValue()); - } + if (!Okay) + Okay = (std::get<1>(Res.first->second) == KV.second) && + (lookupOperandLeader(std::get<2>(Res.first->second)) == + lookupOperandLeader(SE->getStoredValue())); assert(Okay && "Stored expression conflict exists in expression table"); auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst()); assert(ValueExpr && ValueExpr->equals(*SE) && Index: test/Analysis/CostModel/X86/arith.ll =================================================================== --- test/Analysis/CostModel/X86/arith.ll +++ test/Analysis/CostModel/X86/arith.ll @@ -1,516 +1,564 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSSE3 -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42 -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" ; CHECK-LABEL: 'add' define i32 @add(i32 %arg) { - ; SSSE3: cost of 1 {{.*}} %A = add - ; SSE42: cost of 1 {{.*}} %A = add - ; AVX: cost of 1 {{.*}} %A = add - ; AVX2: cost of 1 {{.*}} %A = add - ; AVX512: cost of 1 {{.*}} %A = add - %A = add <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %B = add - ; SSE42: cost of 2 {{.*}} %B = add - ; AVX: cost of 4 
{{.*}} %B = add - ; AVX2: cost of 1 {{.*}} %B = add - ; AVX512: cost of 1 {{.*}} %B = add - %B = add <4 x i64> undef, undef - ; SSSE3: cost of 4 {{.*}} %C = add - ; SSE42: cost of 4 {{.*}} %C = add - ; AVX: cost of 8 {{.*}} %C = add - ; AVX2: cost of 2 {{.*}} %C = add - ; AVX512: cost of 1 {{.*}} %C = add - %C = add <8 x i64> undef, undef - - ; SSSE3: cost of 1 {{.*}} %D = add - ; SSE42: cost of 1 {{.*}} %D = add - ; AVX: cost of 1 {{.*}} %D = add - ; AVX2: cost of 1 {{.*}} %D = add - ; AVX512: cost of 1 {{.*}} %D = add - %D = add <4 x i32> undef, undef - ; SSSE3: cost of 2 {{.*}} %E = add - ; SSE42: cost of 2 {{.*}} %E = add - ; AVX: cost of 4 {{.*}} %E = add - ; AVX2: cost of 1 {{.*}} %E = add - ; AVX512: cost of 1 {{.*}} %E = add - %E = add <8 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %F = add - ; SSE42: cost of 4 {{.*}} %F = add - ; AVX: cost of 8 {{.*}} %F = add - ; AVX2: cost of 2 {{.*}} %F = add - ; AVX512: cost of 1 {{.*}} %F = add - %F = add <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = add - ; SSE42: cost of 1 {{.*}} %G = add - ; AVX: cost of 1 {{.*}} %G = add - ; AVX2: cost of 1 {{.*}} %G = add - ; AVX512: cost of 1 {{.*}} %G = add - %G = add <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = add - ; SSE42: cost of 2 {{.*}} %H = add - ; AVX: cost of 4 {{.*}} %H = add - ; AVX2: cost of 1 {{.*}} %H = add - ; AVX512: cost of 1 {{.*}} %H = add - %H = add <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = add - ; SSE42: cost of 4 {{.*}} %I = add - ; AVX: cost of 8 {{.*}} %I = add - ; AVX2: cost of 2 {{.*}} %I = add - ; AVX512F: cost of 2 {{.*}} %I = add - ; AVX512BW: cost of 1 {{.*}} %I = add - %I = add <32 x i16> undef, undef - - ; SSSE3: cost of 1 {{.*}} %J = add - ; SSE42: cost of 1 {{.*}} %J = add - ; AVX: cost of 1 {{.*}} %J = add - ; AVX2: cost of 1 {{.*}} %J = add - ; AVX512: cost of 1 {{.*}} %J = add - %J = add <16 x i8> undef, undef - ; SSSE3: cost of 2 {{.*}} %K = add - ; SSE42: cost of 2 {{.*}} %K = add - ; AVX: cost of 4 {{.*}} %K = add - ; AVX2: cost of 1 {{.*}} %K = add - ; AVX512: cost of 1 {{.*}} %K = add - %K = add <32 x i8> undef, undef - ; SSSE3: cost of 4 {{.*}} %L = add - ; SSE42: cost of 4 {{.*}} %L = add - ; AVX: cost of 8 {{.*}} %L = add - ; AVX2: cost of 2 {{.*}} %L = add - ; AVX512F: cost of 2 {{.*}} %L = add - ; AVX512BW: cost of 1 {{.*}} %L = add - %L = add <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = add + %I64 = add i64 undef, undef + ; SSSE3: cost of 1 {{.*}} %V2I64 = add + ; SSE42: cost of 1 {{.*}} %V2I64 = add + ; AVX: cost of 1 {{.*}} %V2I64 = add + ; AVX2: cost of 1 {{.*}} %V2I64 = add + ; AVX512: cost of 1 {{.*}} %V2I64 = add + %V2I64 = add <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %V4I64 = add + ; SSE42: cost of 2 {{.*}} %V4I64 = add + ; AVX: cost of 4 {{.*}} %V4I64 = add + ; AVX2: cost of 1 {{.*}} %V4I64 = add + ; AVX512: cost of 1 {{.*}} %V4I64 = add + %V4I64 = add <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %V8I64 = add + ; SSE42: cost of 4 {{.*}} %V8I64 = add + ; AVX: cost of 8 {{.*}} %V8I64 = add + ; AVX2: cost of 2 {{.*}} %V8I64 = add + ; AVX512: cost of 1 {{.*}} %V8I64 = add + %V8I64 = add <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = add + %I32 = add i32 undef, undef + ; SSSE3: cost of 1 {{.*}} %V4I32 = add + ; SSE42: cost of 1 {{.*}} %V4I32 = add + ; AVX: cost of 1 {{.*}} %V4I32 = add + ; AVX2: cost of 1 {{.*}} %V4I32 = add + ; AVX512: cost of 1 {{.*}} %V4I32 = add + %V4I32 = add <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %V8I32 = add + ; SSE42: cost of 2 
{{.*}} %V8I32 = add + ; AVX: cost of 4 {{.*}} %V8I32 = add + ; AVX2: cost of 1 {{.*}} %V8I32 = add + ; AVX512: cost of 1 {{.*}} %V8I32 = add + %V8I32 = add <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %V16I32 = add + ; SSE42: cost of 4 {{.*}} %V16I32 = add + ; AVX: cost of 8 {{.*}} %V16I32 = add + ; AVX2: cost of 2 {{.*}} %V16I32 = add + ; AVX512: cost of 1 {{.*}} %V16I32 = add + %V16I32 = add <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = add + %I16 = add i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = add + ; SSE42: cost of 1 {{.*}} %V8I16 = add + ; AVX: cost of 1 {{.*}} %V8I16 = add + ; AVX2: cost of 1 {{.*}} %V8I16 = add + ; AVX512: cost of 1 {{.*}} %V8I16 = add + %V8I16 = add <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = add + ; SSE42: cost of 2 {{.*}} %V16I16 = add + ; AVX: cost of 4 {{.*}} %V16I16 = add + ; AVX2: cost of 1 {{.*}} %V16I16 = add + ; AVX512: cost of 1 {{.*}} %V16I16 = add + %V16I16 = add <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = add + ; SSE42: cost of 4 {{.*}} %V32I16 = add + ; AVX: cost of 8 {{.*}} %V32I16 = add + ; AVX2: cost of 2 {{.*}} %V32I16 = add + ; AVX512F: cost of 2 {{.*}} %V32I16 = add + ; AVX512BW: cost of 1 {{.*}} %V32I16 = add + %V32I16 = add <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = add + %I8 = add i8 undef, undef + ; SSSE3: cost of 1 {{.*}} %V16I8 = add + ; SSE42: cost of 1 {{.*}} %V16I8 = add + ; AVX: cost of 1 {{.*}} %V16I8 = add + ; AVX2: cost of 1 {{.*}} %V16I8 = add + ; AVX512: cost of 1 {{.*}} %V16I8 = add + %V16I8 = add <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %V32I8 = add + ; SSE42: cost of 2 {{.*}} %V32I8 = add + ; AVX: cost of 4 {{.*}} %V32I8 = add + ; AVX2: cost of 1 {{.*}} %V32I8 = add + ; AVX512: cost of 1 {{.*}} %V32I8 = add + %V32I8 = add <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %V64I8 = add + ; SSE42: cost of 4 {{.*}} %V64I8 = add + ; AVX: cost of 8 {{.*}} %V64I8 = add + ; AVX2: cost of 2 {{.*}} %V64I8 = add + ; AVX512F: cost of 2 {{.*}} %V64I8 = add + ; AVX512BW: cost of 1 {{.*}} %V64I8 = add + %V64I8 = add <64 x i8> undef, undef ret i32 undef } ; CHECK-LABEL: 'sub' define i32 @sub(i32 %arg) { - ; SSSE3: cost of 1 {{.*}} %A = sub - ; SSE42: cost of 1 {{.*}} %A = sub - ; AVX: cost of 1 {{.*}} %A = sub - ; AVX2: cost of 1 {{.*}} %A = sub - ; AVX512: cost of 1 {{.*}} %A = sub - %A = sub <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %B = sub - ; SSE42: cost of 2 {{.*}} %B = sub - ; AVX: cost of 4 {{.*}} %B = sub - ; AVX2: cost of 1 {{.*}} %B = sub - ; AVX512: cost of 1 {{.*}} %B = sub - %B = sub <4 x i64> undef, undef - ; SSSE3: cost of 4 {{.*}} %C = sub - ; SSE42: cost of 4 {{.*}} %C = sub - ; AVX: cost of 8 {{.*}} %C = sub - ; AVX2: cost of 2 {{.*}} %C = sub - ; AVX512: cost of 1 {{.*}} %C = sub - %C = sub <8 x i64> undef, undef - - ; SSSE3: cost of 1 {{.*}} %D = sub - ; SSE42: cost of 1 {{.*}} %D = sub - ; AVX: cost of 1 {{.*}} %D = sub - ; AVX2: cost of 1 {{.*}} %D = sub - ; AVX512: cost of 1 {{.*}} %D = sub - %D = sub <4 x i32> undef, undef - ; SSSE3: cost of 2 {{.*}} %E = sub - ; SSE42: cost of 2 {{.*}} %E = sub - ; AVX: cost of 4 {{.*}} %E = sub - ; AVX2: cost of 1 {{.*}} %E = sub - ; AVX512: cost of 1 {{.*}} %E = sub - %E = sub <8 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %F = sub - ; SSE42: cost of 4 {{.*}} %F = sub - ; AVX: cost of 8 {{.*}} %F = sub - ; AVX2: cost of 2 {{.*}} %F = sub - ; AVX512: cost of 1 {{.*}} %F = sub - %F = sub <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = sub - ; SSE42: cost of 1 {{.*}} 
%G = sub - ; AVX: cost of 1 {{.*}} %G = sub - ; AVX2: cost of 1 {{.*}} %G = sub - ; AVX512: cost of 1 {{.*}} %G = sub - %G = sub <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = sub - ; SSE42: cost of 2 {{.*}} %H = sub - ; AVX: cost of 4 {{.*}} %H = sub - ; AVX2: cost of 1 {{.*}} %H = sub - ; AVX512: cost of 1 {{.*}} %H = sub - %H = sub <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = sub - ; SSE42: cost of 4 {{.*}} %I = sub - ; AVX: cost of 8 {{.*}} %I = sub - ; AVX2: cost of 2 {{.*}} %I = sub - ; AVX512F: cost of 2 {{.*}} %I = sub - ; AVX512BW: cost of 1 {{.*}} %I = sub - %I = sub <32 x i16> undef, undef - - ; SSSE3: cost of 1 {{.*}} %J = sub - ; SSE42: cost of 1 {{.*}} %J = sub - ; AVX: cost of 1 {{.*}} %J = sub - ; AVX2: cost of 1 {{.*}} %J = sub - ; AVX512: cost of 1 {{.*}} %J = sub - %J = sub <16 x i8> undef, undef - ; SSSE3: cost of 2 {{.*}} %K = sub - ; SSE42: cost of 2 {{.*}} %K = sub - ; AVX: cost of 4 {{.*}} %K = sub - ; AVX2: cost of 1 {{.*}} %K = sub - ; AVX512: cost of 1 {{.*}} %K = sub - %K = sub <32 x i8> undef, undef - ; SSSE3: cost of 4 {{.*}} %L = sub - ; SSE42: cost of 4 {{.*}} %L = sub - ; AVX: cost of 8 {{.*}} %L = sub - ; AVX2: cost of 2 {{.*}} %L = sub - ; AVX512F: cost of 2 {{.*}} %L = sub - ; AVX512BW: cost of 1 {{.*}} %L = sub - %L = sub <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = sub + %I64 = sub i64 undef, undef + ; SSSE3: cost of 1 {{.*}} %V2I64 = sub + ; SSE42: cost of 1 {{.*}} %V2I64 = sub + ; AVX: cost of 1 {{.*}} %V2I64 = sub + ; AVX2: cost of 1 {{.*}} %V2I64 = sub + ; AVX512: cost of 1 {{.*}} %V2I64 = sub + %V2I64 = sub <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %V4I64 = sub + ; SSE42: cost of 2 {{.*}} %V4I64 = sub + ; AVX: cost of 4 {{.*}} %V4I64 = sub + ; AVX2: cost of 1 {{.*}} %V4I64 = sub + ; AVX512: cost of 1 {{.*}} %V4I64 = sub + %V4I64 = sub <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %V8I64 = sub + ; SSE42: cost of 4 {{.*}} %V8I64 = sub + ; AVX: cost of 8 {{.*}} %V8I64 = sub + ; AVX2: cost of 2 {{.*}} %V8I64 = sub + ; AVX512: cost of 1 {{.*}} %V8I64 = sub + %V8I64 = sub <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = sub + %I32 = sub i32 undef, undef + ; SSSE3: cost of 1 {{.*}} %V4I32 = sub + ; SSE42: cost of 1 {{.*}} %V4I32 = sub + ; AVX: cost of 1 {{.*}} %V4I32 = sub + ; AVX2: cost of 1 {{.*}} %V4I32 = sub + ; AVX512: cost of 1 {{.*}} %V4I32 = sub + %V4I32 = sub <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %V8I32 = sub + ; SSE42: cost of 2 {{.*}} %V8I32 = sub + ; AVX: cost of 4 {{.*}} %V8I32 = sub + ; AVX2: cost of 1 {{.*}} %V8I32 = sub + ; AVX512: cost of 1 {{.*}} %V8I32 = sub + %V8I32 = sub <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %V16I32 = sub + ; SSE42: cost of 4 {{.*}} %V16I32 = sub + ; AVX: cost of 8 {{.*}} %V16I32 = sub + ; AVX2: cost of 2 {{.*}} %V16I32 = sub + ; AVX512: cost of 1 {{.*}} %V16I32 = sub + %V16I32 = sub <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = sub + %I16 = sub i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = sub + ; SSE42: cost of 1 {{.*}} %V8I16 = sub + ; AVX: cost of 1 {{.*}} %V8I16 = sub + ; AVX2: cost of 1 {{.*}} %V8I16 = sub + ; AVX512: cost of 1 {{.*}} %V8I16 = sub + %V8I16 = sub <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = sub + ; SSE42: cost of 2 {{.*}} %V16I16 = sub + ; AVX: cost of 4 {{.*}} %V16I16 = sub + ; AVX2: cost of 1 {{.*}} %V16I16 = sub + ; AVX512: cost of 1 {{.*}} %V16I16 = sub + %V16I16 = sub <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = sub + ; SSE42: cost of 4 {{.*}} %V32I16 
= sub + ; AVX: cost of 8 {{.*}} %V32I16 = sub + ; AVX2: cost of 2 {{.*}} %V32I16 = sub + ; AVX512F: cost of 2 {{.*}} %V32I16 = sub + ; AVX512BW: cost of 1 {{.*}} %V32I16 = sub + %V32I16 = sub <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = sub + %I8 = sub i8 undef, undef + ; SSSE3: cost of 1 {{.*}} %V16I8 = sub + ; SSE42: cost of 1 {{.*}} %V16I8 = sub + ; AVX: cost of 1 {{.*}} %V16I8 = sub + ; AVX2: cost of 1 {{.*}} %V16I8 = sub + ; AVX512: cost of 1 {{.*}} %V16I8 = sub + %V16I8 = sub <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %V32I8 = sub + ; SSE42: cost of 2 {{.*}} %V32I8 = sub + ; AVX: cost of 4 {{.*}} %V32I8 = sub + ; AVX2: cost of 1 {{.*}} %V32I8 = sub + ; AVX512: cost of 1 {{.*}} %V32I8 = sub + %V32I8 = sub <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %V64I8 = sub + ; SSE42: cost of 4 {{.*}} %V64I8 = sub + ; AVX: cost of 8 {{.*}} %V64I8 = sub + ; AVX2: cost of 2 {{.*}} %V64I8 = sub + ; AVX512F: cost of 2 {{.*}} %V64I8 = sub + ; AVX512BW: cost of 1 {{.*}} %V64I8 = sub + %V64I8 = sub <64 x i8> undef, undef ret i32 undef } ; CHECK-LABEL: 'or' define i32 @or(i32 %arg) { - ; SSSE3: cost of 1 {{.*}} %A = or - ; SSE42: cost of 1 {{.*}} %A = or - ; AVX: cost of 1 {{.*}} %A = or - ; AVX2: cost of 1 {{.*}} %A = or - ; AVX512: cost of 1 {{.*}} %A = or - %A = or <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %B = or - ; SSE42: cost of 2 {{.*}} %B = or - ; AVX: cost of 1 {{.*}} %B = or - ; AVX2: cost of 1 {{.*}} %B = or - ; AVX512: cost of 1 {{.*}} %B = or - %B = or <4 x i64> undef, undef - ; SSSE3: cost of 4 {{.*}} %C = or - ; SSE42: cost of 4 {{.*}} %C = or - ; AVX: cost of 2 {{.*}} %C = or - ; AVX2: cost of 2 {{.*}} %C = or - ; AVX512: cost of 1 {{.*}} %C = or - %C = or <8 x i64> undef, undef - - ; SSSE3: cost of 1 {{.*}} %D = or - ; SSE42: cost of 1 {{.*}} %D = or - ; AVX: cost of 1 {{.*}} %D = or - ; AVX2: cost of 1 {{.*}} %D = or - ; AVX512: cost of 1 {{.*}} %D = or - %D = or <4 x i32> undef, undef - ; SSSE3: cost of 2 {{.*}} %E = or - ; SSE42: cost of 2 {{.*}} %E = or - ; AVX: cost of 1 {{.*}} %E = or - ; AVX2: cost of 1 {{.*}} %E = or - ; AVX512: cost of 1 {{.*}} %E = or - %E = or <8 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %F = or - ; SSE42: cost of 4 {{.*}} %F = or - ; AVX: cost of 2 {{.*}} %F = or - ; AVX2: cost of 2 {{.*}} %F = or - ; AVX512: cost of 1 {{.*}} %F = or - %F = or <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = or - ; SSE42: cost of 1 {{.*}} %G = or - ; AVX: cost of 1 {{.*}} %G = or - ; AVX2: cost of 1 {{.*}} %G = or - ; AVX512: cost of 1 {{.*}} %G = or - %G = or <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = or - ; SSE42: cost of 2 {{.*}} %H = or - ; AVX: cost of 1 {{.*}} %H = or - ; AVX2: cost of 1 {{.*}} %H = or - ; AVX512: cost of 1 {{.*}} %H = or - %H = or <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = or - ; SSE42: cost of 4 {{.*}} %I = or - ; AVX: cost of 2 {{.*}} %I = or - ; AVX2: cost of 2 {{.*}} %I = or - ; AVX512F: cost of 2 {{.*}} %I = or - ; AVX512BW: cost of 1 {{.*}} %I = or - %I = or <32 x i16> undef, undef - - ; SSSE3: cost of 1 {{.*}} %J = or - ; SSE42: cost of 1 {{.*}} %J = or - ; AVX: cost of 1 {{.*}} %J = or - ; AVX2: cost of 1 {{.*}} %J = or - ; AVX512: cost of 1 {{.*}} %J = or - %J = or <16 x i8> undef, undef - ; SSSE3: cost of 2 {{.*}} %K = or - ; SSE42: cost of 2 {{.*}} %K = or - ; AVX: cost of 1 {{.*}} %K = or - ; AVX2: cost of 1 {{.*}} %K = or - ; AVX512: cost of 1 {{.*}} %K = or - %K = or <32 x i8> undef, undef - ; SSSE3: cost of 4 {{.*}} %L = or - ; SSE42: cost of 4 {{.*}} %L = 
or - ; AVX: cost of 2 {{.*}} %L = or - ; AVX2: cost of 2 {{.*}} %L = or - ; AVX512F: cost of 2 {{.*}} %L = or - ; AVX512BW: cost of 1 {{.*}} %L = or - %L = or <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = or + %I64 = or i64 undef, undef + ; SSSE3: cost of 1 {{.*}} %V2I64 = or + ; SSE42: cost of 1 {{.*}} %V2I64 = or + ; AVX: cost of 1 {{.*}} %V2I64 = or + ; AVX2: cost of 1 {{.*}} %V2I64 = or + ; AVX512: cost of 1 {{.*}} %V2I64 = or + %V2I64 = or <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %V4I64 = or + ; SSE42: cost of 2 {{.*}} %V4I64 = or + ; AVX: cost of 1 {{.*}} %V4I64 = or + ; AVX2: cost of 1 {{.*}} %V4I64 = or + ; AVX512: cost of 1 {{.*}} %V4I64 = or + %V4I64 = or <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %V8I64 = or + ; SSE42: cost of 4 {{.*}} %V8I64 = or + ; AVX: cost of 2 {{.*}} %V8I64 = or + ; AVX2: cost of 2 {{.*}} %V8I64 = or + ; AVX512: cost of 1 {{.*}} %V8I64 = or + %V8I64 = or <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = or + %I32 = or i32 undef, undef + ; SSSE3: cost of 1 {{.*}} %V4I32 = or + ; SSE42: cost of 1 {{.*}} %V4I32 = or + ; AVX: cost of 1 {{.*}} %V4I32 = or + ; AVX2: cost of 1 {{.*}} %V4I32 = or + ; AVX512: cost of 1 {{.*}} %V4I32 = or + %V4I32 = or <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %V8I32 = or + ; SSE42: cost of 2 {{.*}} %V8I32 = or + ; AVX: cost of 1 {{.*}} %V8I32 = or + ; AVX2: cost of 1 {{.*}} %V8I32 = or + ; AVX512: cost of 1 {{.*}} %V8I32 = or + %V8I32 = or <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %V16I32 = or + ; SSE42: cost of 4 {{.*}} %V16I32 = or + ; AVX: cost of 2 {{.*}} %V16I32 = or + ; AVX2: cost of 2 {{.*}} %V16I32 = or + ; AVX512: cost of 1 {{.*}} %V16I32 = or + %V16I32 = or <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = or + %I16 = or i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = or + ; SSE42: cost of 1 {{.*}} %V8I16 = or + ; AVX: cost of 1 {{.*}} %V8I16 = or + ; AVX2: cost of 1 {{.*}} %V8I16 = or + ; AVX512: cost of 1 {{.*}} %V8I16 = or + %V8I16 = or <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = or + ; SSE42: cost of 2 {{.*}} %V16I16 = or + ; AVX: cost of 1 {{.*}} %V16I16 = or + ; AVX2: cost of 1 {{.*}} %V16I16 = or + ; AVX512: cost of 1 {{.*}} %V16I16 = or + %V16I16 = or <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = or + ; SSE42: cost of 4 {{.*}} %V32I16 = or + ; AVX: cost of 2 {{.*}} %V32I16 = or + ; AVX2: cost of 2 {{.*}} %V32I16 = or + ; AVX512F: cost of 2 {{.*}} %V32I16 = or + ; AVX512BW: cost of 1 {{.*}} %V32I16 = or + %V32I16 = or <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = or + %I8 = or i8 undef, undef + ; SSSE3: cost of 1 {{.*}} %V16I8 = or + ; SSE42: cost of 1 {{.*}} %V16I8 = or + ; AVX: cost of 1 {{.*}} %V16I8 = or + ; AVX2: cost of 1 {{.*}} %V16I8 = or + ; AVX512: cost of 1 {{.*}} %V16I8 = or + %V16I8 = or <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %V32I8 = or + ; SSE42: cost of 2 {{.*}} %V32I8 = or + ; AVX: cost of 1 {{.*}} %V32I8 = or + ; AVX2: cost of 1 {{.*}} %V32I8 = or + ; AVX512: cost of 1 {{.*}} %V32I8 = or + %V32I8 = or <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %V64I8 = or + ; SSE42: cost of 4 {{.*}} %V64I8 = or + ; AVX: cost of 2 {{.*}} %V64I8 = or + ; AVX2: cost of 2 {{.*}} %V64I8 = or + ; AVX512F: cost of 2 {{.*}} %V64I8 = or + ; AVX512BW: cost of 1 {{.*}} %V64I8 = or + %V64I8 = or <64 x i8> undef, undef ret i32 undef } ; CHECK-LABEL: 'xor' define i32 @xor(i32 %arg) { - ; SSSE3: cost of 1 {{.*}} %A = xor - ; SSE42: cost of 1 {{.*}} %A = xor - ; AVX: cost of 1 {{.*}} %A = 
xor - ; AVX2: cost of 1 {{.*}} %A = xor - ; AVX512: cost of 1 {{.*}} %A = xor - %A = xor <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %B = xor - ; SSE42: cost of 2 {{.*}} %B = xor - ; AVX: cost of 1 {{.*}} %B = xor - ; AVX2: cost of 1 {{.*}} %B = xor - ; AVX512: cost of 1 {{.*}} %B = xor - %B = xor <4 x i64> undef, undef - ; SSSE3: cost of 4 {{.*}} %C = xor - ; SSE42: cost of 4 {{.*}} %C = xor - ; AVX: cost of 2 {{.*}} %C = xor - ; AVX2: cost of 2 {{.*}} %C = xor - ; AVX512: cost of 1 {{.*}} %C = xor - %C = xor <8 x i64> undef, undef - - ; SSSE3: cost of 1 {{.*}} %D = xor - ; SSE42: cost of 1 {{.*}} %D = xor - ; AVX: cost of 1 {{.*}} %D = xor - ; AVX2: cost of 1 {{.*}} %D = xor - ; AVX512: cost of 1 {{.*}} %D = xor - %D = xor <4 x i32> undef, undef - ; SSSE3: cost of 2 {{.*}} %E = xor - ; SSE42: cost of 2 {{.*}} %E = xor - ; AVX: cost of 1 {{.*}} %E = xor - ; AVX2: cost of 1 {{.*}} %E = xor - ; AVX512: cost of 1 {{.*}} %E = xor - %E = xor <8 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %F = xor - ; SSE42: cost of 4 {{.*}} %F = xor - ; AVX: cost of 2 {{.*}} %F = xor - ; AVX2: cost of 2 {{.*}} %F = xor - ; AVX512: cost of 1 {{.*}} %F = xor - %F = xor <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = xor - ; SSE42: cost of 1 {{.*}} %G = xor - ; AVX: cost of 1 {{.*}} %G = xor - ; AVX2: cost of 1 {{.*}} %G = xor - ; AVX512: cost of 1 {{.*}} %G = xor - %G = xor <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = xor - ; SSE42: cost of 2 {{.*}} %H = xor - ; AVX: cost of 1 {{.*}} %H = xor - ; AVX2: cost of 1 {{.*}} %H = xor - ; AVX512: cost of 1 {{.*}} %H = xor - %H = xor <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = xor - ; SSE42: cost of 4 {{.*}} %I = xor - ; AVX: cost of 2 {{.*}} %I = xor - ; AVX2: cost of 2 {{.*}} %I = xor - ; AVX512F: cost of 2 {{.*}} %I = xor - ; AVX512BW: cost of 1 {{.*}} %I = xor - %I = xor <32 x i16> undef, undef - - ; SSSE3: cost of 1 {{.*}} %J = xor - ; SSE42: cost of 1 {{.*}} %J = xor - ; AVX: cost of 1 {{.*}} %J = xor - ; AVX2: cost of 1 {{.*}} %J = xor - ; AVX512: cost of 1 {{.*}} %J = xor - %J = xor <16 x i8> undef, undef - ; SSSE3: cost of 2 {{.*}} %K = xor - ; SSE42: cost of 2 {{.*}} %K = xor - ; AVX: cost of 1 {{.*}} %K = xor - ; AVX2: cost of 1 {{.*}} %K = xor - ; AVX512: cost of 1 {{.*}} %K = xor - %K = xor <32 x i8> undef, undef - ; SSSE3: cost of 4 {{.*}} %L = xor - ; SSE42: cost of 4 {{.*}} %L = xor - ; AVX: cost of 2 {{.*}} %L = xor - ; AVX2: cost of 2 {{.*}} %L = xor - ; AVX512F: cost of 2 {{.*}} %L = xor - ; AVX512BW: cost of 1 {{.*}} %L = xor - %L = xor <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = xor + %I64 = xor i64 undef, undef + ; SSSE3: cost of 1 {{.*}} %V2I64 = xor + ; SSE42: cost of 1 {{.*}} %V2I64 = xor + ; AVX: cost of 1 {{.*}} %V2I64 = xor + ; AVX2: cost of 1 {{.*}} %V2I64 = xor + ; AVX512: cost of 1 {{.*}} %V2I64 = xor + %V2I64 = xor <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %V4I64 = xor + ; SSE42: cost of 2 {{.*}} %V4I64 = xor + ; AVX: cost of 1 {{.*}} %V4I64 = xor + ; AVX2: cost of 1 {{.*}} %V4I64 = xor + ; AVX512: cost of 1 {{.*}} %V4I64 = xor + %V4I64 = xor <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %V8I64 = xor + ; SSE42: cost of 4 {{.*}} %V8I64 = xor + ; AVX: cost of 2 {{.*}} %V8I64 = xor + ; AVX2: cost of 2 {{.*}} %V8I64 = xor + ; AVX512: cost of 1 {{.*}} %V8I64 = xor + %V8I64 = xor <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = xor + %I32 = xor i32 undef, undef + ; SSSE3: cost of 1 {{.*}} %V4I32 = xor + ; SSE42: cost of 1 {{.*}} %V4I32 = xor + ; AVX: cost of 1 
{{.*}} %V4I32 = xor + ; AVX2: cost of 1 {{.*}} %V4I32 = xor + ; AVX512: cost of 1 {{.*}} %V4I32 = xor + %V4I32 = xor <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %V8I32 = xor + ; SSE42: cost of 2 {{.*}} %V8I32 = xor + ; AVX: cost of 1 {{.*}} %V8I32 = xor + ; AVX2: cost of 1 {{.*}} %V8I32 = xor + ; AVX512: cost of 1 {{.*}} %V8I32 = xor + %V8I32 = xor <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %V16I32 = xor + ; SSE42: cost of 4 {{.*}} %V16I32 = xor + ; AVX: cost of 2 {{.*}} %V16I32 = xor + ; AVX2: cost of 2 {{.*}} %V16I32 = xor + ; AVX512: cost of 1 {{.*}} %V16I32 = xor + %V16I32 = xor <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = xor + %I16 = xor i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = xor + ; SSE42: cost of 1 {{.*}} %V8I16 = xor + ; AVX: cost of 1 {{.*}} %V8I16 = xor + ; AVX2: cost of 1 {{.*}} %V8I16 = xor + ; AVX512: cost of 1 {{.*}} %V8I16 = xor + %V8I16 = xor <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = xor + ; SSE42: cost of 2 {{.*}} %V16I16 = xor + ; AVX: cost of 1 {{.*}} %V16I16 = xor + ; AVX2: cost of 1 {{.*}} %V16I16 = xor + ; AVX512: cost of 1 {{.*}} %V16I16 = xor + %V16I16 = xor <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = xor + ; SSE42: cost of 4 {{.*}} %V32I16 = xor + ; AVX: cost of 2 {{.*}} %V32I16 = xor + ; AVX2: cost of 2 {{.*}} %V32I16 = xor + ; AVX512F: cost of 2 {{.*}} %V32I16 = xor + ; AVX512BW: cost of 1 {{.*}} %V32I16 = xor + %V32I16 = xor <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = xor + %I8 = xor i8 undef, undef + ; SSSE3: cost of 1 {{.*}} %V16I8 = xor + ; SSE42: cost of 1 {{.*}} %V16I8 = xor + ; AVX: cost of 1 {{.*}} %V16I8 = xor + ; AVX2: cost of 1 {{.*}} %V16I8 = xor + ; AVX512: cost of 1 {{.*}} %V16I8 = xor + %V16I8 = xor <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %V32I8 = xor + ; SSE42: cost of 2 {{.*}} %V32I8 = xor + ; AVX: cost of 1 {{.*}} %V32I8 = xor + ; AVX2: cost of 1 {{.*}} %V32I8 = xor + ; AVX512: cost of 1 {{.*}} %V32I8 = xor + %V32I8 = xor <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %V64I8 = xor + ; SSE42: cost of 4 {{.*}} %V64I8 = xor + ; AVX: cost of 2 {{.*}} %V64I8 = xor + ; AVX2: cost of 2 {{.*}} %V64I8 = xor + ; AVX512F: cost of 2 {{.*}} %V64I8 = xor + ; AVX512BW: cost of 1 {{.*}} %V64I8 = xor + %V64I8 = xor <64 x i8> undef, undef ret i32 undef } ; CHECK-LABEL: 'and' define i32 @and(i32 %arg) { - ; SSSE3: cost of 1 {{.*}} %A = and - ; SSE42: cost of 1 {{.*}} %A = and - ; AVX: cost of 1 {{.*}} %A = and - ; AVX2: cost of 1 {{.*}} %A = and - ; AVX512: cost of 1 {{.*}} %A = and - %A = and <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %B = and - ; SSE42: cost of 2 {{.*}} %B = and - ; AVX: cost of 1 {{.*}} %B = and - ; AVX2: cost of 1 {{.*}} %B = and - ; AVX512: cost of 1 {{.*}} %B = and - %B = and <4 x i64> undef, undef - ; SSSE3: cost of 4 {{.*}} %C = and - ; SSE42: cost of 4 {{.*}} %C = and - ; AVX: cost of 2 {{.*}} %C = and - ; AVX2: cost of 2 {{.*}} %C = and - ; AVX512: cost of 1 {{.*}} %C = and - %C = and <8 x i64> undef, undef - - ; SSSE3: cost of 1 {{.*}} %D = and - ; SSE42: cost of 1 {{.*}} %D = and - ; AVX: cost of 1 {{.*}} %D = and - ; AVX2: cost of 1 {{.*}} %D = and - ; AVX512: cost of 1 {{.*}} %D = and - %D = and <4 x i32> undef, undef - ; SSSE3: cost of 2 {{.*}} %E = and - ; SSE42: cost of 2 {{.*}} %E = and - ; AVX: cost of 1 {{.*}} %E = and - ; AVX2: cost of 1 {{.*}} %E = and - ; AVX512: cost of 1 {{.*}} %E = and - %E = and <8 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %F = and - ; SSE42: cost of 4 {{.*}} %F = and - ; AVX: 
cost of 2 {{.*}} %F = and - ; AVX2: cost of 2 {{.*}} %F = and - ; AVX512: cost of 1 {{.*}} %F = and - %F = and <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = and - ; SSE42: cost of 1 {{.*}} %G = and - ; AVX: cost of 1 {{.*}} %G = and - ; AVX2: cost of 1 {{.*}} %G = and - ; AVX512: cost of 1 {{.*}} %G = and - %G = and <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = and - ; SSE42: cost of 2 {{.*}} %H = and - ; AVX: cost of 1 {{.*}} %H = and - ; AVX2: cost of 1 {{.*}} %H = and - ; AVX512: cost of 1 {{.*}} %H = and - %H = and <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = and - ; SSE42: cost of 4 {{.*}} %I = and - ; AVX: cost of 2 {{.*}} %I = and - ; AVX2: cost of 2 {{.*}} %I = and - ; AVX512F: cost of 2 {{.*}} %I = and - ; AVX512BW: cost of 1 {{.*}} %I = and - %I = and <32 x i16> undef, undef - - ; SSSE3: cost of 1 {{.*}} %J = and - ; SSE42: cost of 1 {{.*}} %J = and - ; AVX: cost of 1 {{.*}} %J = and - ; AVX2: cost of 1 {{.*}} %J = and - ; AVX512: cost of 1 {{.*}} %J = and - %J = and <16 x i8> undef, undef - ; SSSE3: cost of 2 {{.*}} %K = and - ; SSE42: cost of 2 {{.*}} %K = and - ; AVX: cost of 1 {{.*}} %K = and - ; AVX2: cost of 1 {{.*}} %K = and - ; AVX512: cost of 1 {{.*}} %K = and - %K = and <32 x i8> undef, undef - ; SSSE3: cost of 4 {{.*}} %L = and - ; SSE42: cost of 4 {{.*}} %L = and - ; AVX: cost of 2 {{.*}} %L = and - ; AVX2: cost of 2 {{.*}} %L = and - ; AVX512F: cost of 2 {{.*}} %L = and - ; AVX512BW: cost of 1 {{.*}} %L = and - %L = and <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = and + %I64 = and i64 undef, undef + ; SSSE3: cost of 1 {{.*}} %V2I64 = and + ; SSE42: cost of 1 {{.*}} %V2I64 = and + ; AVX: cost of 1 {{.*}} %V2I64 = and + ; AVX2: cost of 1 {{.*}} %V2I64 = and + ; AVX512: cost of 1 {{.*}} %V2I64 = and + %V2I64 = and <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %V4I64 = and + ; SSE42: cost of 2 {{.*}} %V4I64 = and + ; AVX: cost of 1 {{.*}} %V4I64 = and + ; AVX2: cost of 1 {{.*}} %V4I64 = and + ; AVX512: cost of 1 {{.*}} %V4I64 = and + %V4I64 = and <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %V8I64 = and + ; SSE42: cost of 4 {{.*}} %V8I64 = and + ; AVX: cost of 2 {{.*}} %V8I64 = and + ; AVX2: cost of 2 {{.*}} %V8I64 = and + ; AVX512: cost of 1 {{.*}} %V8I64 = and + %V8I64 = and <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = and + %I32 = and i32 undef, undef + ; SSSE3: cost of 1 {{.*}} %V4I32 = and + ; SSE42: cost of 1 {{.*}} %V4I32 = and + ; AVX: cost of 1 {{.*}} %V4I32 = and + ; AVX2: cost of 1 {{.*}} %V4I32 = and + ; AVX512: cost of 1 {{.*}} %V4I32 = and + %V4I32 = and <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %V8I32 = and + ; SSE42: cost of 2 {{.*}} %V8I32 = and + ; AVX: cost of 1 {{.*}} %V8I32 = and + ; AVX2: cost of 1 {{.*}} %V8I32 = and + ; AVX512: cost of 1 {{.*}} %V8I32 = and + %V8I32 = and <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %V16I32 = and + ; SSE42: cost of 4 {{.*}} %V16I32 = and + ; AVX: cost of 2 {{.*}} %V16I32 = and + ; AVX2: cost of 2 {{.*}} %V16I32 = and + ; AVX512: cost of 1 {{.*}} %V16I32 = and + %V16I32 = and <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = and + %I16 = and i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = and + ; SSE42: cost of 1 {{.*}} %V8I16 = and + ; AVX: cost of 1 {{.*}} %V8I16 = and + ; AVX2: cost of 1 {{.*}} %V8I16 = and + ; AVX512: cost of 1 {{.*}} %V8I16 = and + %V8I16 = and <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = and + ; SSE42: cost of 2 {{.*}} %V16I16 = and + ; AVX: cost of 1 {{.*}} %V16I16 = and 
+ ; AVX2: cost of 1 {{.*}} %V16I16 = and + ; AVX512: cost of 1 {{.*}} %V16I16 = and + %V16I16 = and <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = and + ; SSE42: cost of 4 {{.*}} %V32I16 = and + ; AVX: cost of 2 {{.*}} %V32I16 = and + ; AVX2: cost of 2 {{.*}} %V32I16 = and + ; AVX512F: cost of 2 {{.*}} %V32I16 = and + ; AVX512BW: cost of 1 {{.*}} %V32I16 = and + %V32I16 = and <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = and + %I8 = and i8 undef, undef + ; SSSE3: cost of 1 {{.*}} %V16I8 = and + ; SSE42: cost of 1 {{.*}} %V16I8 = and + ; AVX: cost of 1 {{.*}} %V16I8 = and + ; AVX2: cost of 1 {{.*}} %V16I8 = and + ; AVX512: cost of 1 {{.*}} %V16I8 = and + %V16I8 = and <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %V32I8 = and + ; SSE42: cost of 2 {{.*}} %V32I8 = and + ; AVX: cost of 1 {{.*}} %V32I8 = and + ; AVX2: cost of 1 {{.*}} %V32I8 = and + ; AVX512: cost of 1 {{.*}} %V32I8 = and + %V32I8 = and <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %V64I8 = and + ; SSE42: cost of 4 {{.*}} %V64I8 = and + ; AVX: cost of 2 {{.*}} %V64I8 = and + ; AVX2: cost of 2 {{.*}} %V64I8 = and + ; AVX512F: cost of 2 {{.*}} %V64I8 = and + ; AVX512BW: cost of 1 {{.*}} %V64I8 = and + %V64I8 = and <64 x i8> undef, undef ret i32 undef } ; CHECK-LABEL: 'mul' define i32 @mul(i32 %arg) { - ; SSSE3: cost of 8 {{.*}} %A = mul - ; SSE42: cost of 8 {{.*}} %A = mul - ; AVX: cost of 8 {{.*}} %A = mul - ; AVX2: cost of 8 {{.*}} %A = mul - ; AVX512F: cost of 8 {{.*}} %A = mul - ; AVX512BW: cost of 8 {{.*}} %A = mul - ; AVX512DQ: cost of 1 {{.*}} %A = mul - %A = mul <2 x i64> undef, undef - ; SSSE3: cost of 16 {{.*}} %B = mul - ; SSE42: cost of 16 {{.*}} %B = mul - ; AVX: cost of 18 {{.*}} %B = mul - ; AVX2: cost of 8 {{.*}} %B = mul - ; AVX512F: cost of 8 {{.*}} %B = mul - ; AVX512BW: cost of 8 {{.*}} %B = mul - ; AVX512DQ: cost of 1 {{.*}} %B = mul - %B = mul <4 x i64> undef, undef - ; SSSE3: cost of 32 {{.*}} %C = mul - ; SSE42: cost of 32 {{.*}} %C = mul - ; AVX: cost of 36 {{.*}} %C = mul - ; AVX2: cost of 16 {{.*}} %C = mul - ; AVX512F: cost of 8 {{.*}} %C = mul - ; AVX512BW: cost of 8 {{.*}} %C = mul - ; AVX512DQ: cost of 1 {{.*}} %C = mul - %C = mul <8 x i64> undef, undef - - ; SSSE3: cost of 6 {{.*}} %D = mul - ; SSE42: cost of 1 {{.*}} %D = mul - ; AVX: cost of 1 {{.*}} %D = mul - ; AVX2: cost of 1 {{.*}} %D = mul - ; AVX512: cost of 1 {{.*}} %D = mul - %D = mul <4 x i32> undef, undef - ; SSSE3: cost of 12 {{.*}} %E = mul - ; SSE42: cost of 2 {{.*}} %E = mul - ; AVX: cost of 4 {{.*}} %E = mul - ; AVX2: cost of 1 {{.*}} %E = mul - ; AVX512: cost of 1 {{.*}} %E = mul - %E = mul <8 x i32> undef, undef - ; SSSE3: cost of 24 {{.*}} %F = mul - ; SSE42: cost of 4 {{.*}} %F = mul - ; AVX: cost of 8 {{.*}} %F = mul - ; AVX2: cost of 2 {{.*}} %F = mul - ; AVX512: cost of 1 {{.*}} %F = mul - %F = mul <16 x i32> undef, undef - - ; SSSE3: cost of 1 {{.*}} %G = mul - ; SSE42: cost of 1 {{.*}} %G = mul - ; AVX: cost of 1 {{.*}} %G = mul - ; AVX2: cost of 1 {{.*}} %G = mul - ; AVX512: cost of 1 {{.*}} %G = mul - %G = mul <8 x i16> undef, undef - ; SSSE3: cost of 2 {{.*}} %H = mul - ; SSE42: cost of 2 {{.*}} %H = mul - ; AVX: cost of 4 {{.*}} %H = mul - ; AVX2: cost of 1 {{.*}} %H = mul - ; AVX512: cost of 1 {{.*}} %H = mul - %H = mul <16 x i16> undef, undef - ; SSSE3: cost of 4 {{.*}} %I = mul - ; SSE42: cost of 4 {{.*}} %I = mul - ; AVX: cost of 8 {{.*}} %I = mul - ; AVX2: cost of 2 {{.*}} %I = mul - ; AVX512F: cost of 2 {{.*}} %I = mul - ; AVX512BW: cost of 1 {{.*}} %I = mul - %I = mul <32 x 
i16> undef, undef - - ; SSSE3: cost of 12 {{.*}} %J = mul - ; SSE42: cost of 12 {{.*}} %J = mul - ; AVX: cost of 12 {{.*}} %J = mul - ; AVX2: cost of 7 {{.*}} %J = mul - ; AVX512F: cost of 5 {{.*}} %J = mul - ; AVX512BW: cost of 4 {{.*}} %J = mul - %J = mul <16 x i8> undef, undef - ; SSSE3: cost of 24 {{.*}} %K = mul - ; SSE42: cost of 24 {{.*}} %K = mul - ; AVX: cost of 26 {{.*}} %K = mul - ; AVX2: cost of 17 {{.*}} %K = mul - ; AVX512F: cost of 13 {{.*}} %K = mul - ; AVX512BW: cost of 4 {{.*}} %K = mul - %K = mul <32 x i8> undef, undef - ; SSSE3: cost of 48 {{.*}} %L = mul - ; SSE42: cost of 48 {{.*}} %L = mul - ; AVX: cost of 52 {{.*}} %L = mul - ; AVX2: cost of 34 {{.*}} %L = mul - ; AVX512F: cost of 26 {{.*}} %L = mul - ; AVX512BW: cost of 11 {{.*}} %L = mul - %L = mul <64 x i8> undef, undef + ; CHECK: cost of 1 {{.*}} %I64 = mul + %I64 = mul i64 undef, undef + ; SSSE3: cost of 8 {{.*}} %V2I64 = mul + ; SSE42: cost of 8 {{.*}} %V2I64 = mul + ; AVX: cost of 8 {{.*}} %V2I64 = mul + ; AVX2: cost of 8 {{.*}} %V2I64 = mul + ; AVX512F: cost of 8 {{.*}} %V2I64 = mul + ; AVX512BW: cost of 8 {{.*}} %V2I64 = mul + ; AVX512DQ: cost of 1 {{.*}} %V2I64 = mul + %V2I64 = mul <2 x i64> undef, undef + ; SSSE3: cost of 16 {{.*}} %V4I64 = mul + ; SSE42: cost of 16 {{.*}} %V4I64 = mul + ; AVX: cost of 18 {{.*}} %V4I64 = mul + ; AVX2: cost of 8 {{.*}} %V4I64 = mul + ; AVX512F: cost of 8 {{.*}} %V4I64 = mul + ; AVX512BW: cost of 8 {{.*}} %V4I64 = mul + ; AVX512DQ: cost of 1 {{.*}} %V4I64 = mul + %V4I64 = mul <4 x i64> undef, undef + ; SSSE3: cost of 32 {{.*}} %V8I64 = mul + ; SSE42: cost of 32 {{.*}} %V8I64 = mul + ; AVX: cost of 36 {{.*}} %V8I64 = mul + ; AVX2: cost of 16 {{.*}} %V8I64 = mul + ; AVX512F: cost of 8 {{.*}} %V8I64 = mul + ; AVX512BW: cost of 8 {{.*}} %V8I64 = mul + ; AVX512DQ: cost of 1 {{.*}} %V8I64 = mul + %V8I64 = mul <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = mul + %I32 = mul i32 undef, undef + ; SSSE3: cost of 6 {{.*}} %V4I32 = mul + ; SSE42: cost of 1 {{.*}} %V4I32 = mul + ; AVX: cost of 1 {{.*}} %V4I32 = mul + ; AVX2: cost of 1 {{.*}} %V4I32 = mul + ; AVX512: cost of 1 {{.*}} %V4I32 = mul + %V4I32 = mul <4 x i32> undef, undef + ; SSSE3: cost of 12 {{.*}} %V8I32 = mul + ; SSE42: cost of 2 {{.*}} %V8I32 = mul + ; AVX: cost of 4 {{.*}} %V8I32 = mul + ; AVX2: cost of 1 {{.*}} %V8I32 = mul + ; AVX512: cost of 1 {{.*}} %V8I32 = mul + %V8I32 = mul <8 x i32> undef, undef + ; SSSE3: cost of 24 {{.*}} %V16I32 = mul + ; SSE42: cost of 4 {{.*}} %V16I32 = mul + ; AVX: cost of 8 {{.*}} %V16I32 = mul + ; AVX2: cost of 2 {{.*}} %V16I32 = mul + ; AVX512: cost of 1 {{.*}} %V16I32 = mul + %V16I32 = mul <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = mul + %I16 = mul i16 undef, undef + ; SSSE3: cost of 1 {{.*}} %V8I16 = mul + ; SSE42: cost of 1 {{.*}} %V8I16 = mul + ; AVX: cost of 1 {{.*}} %V8I16 = mul + ; AVX2: cost of 1 {{.*}} %V8I16 = mul + ; AVX512: cost of 1 {{.*}} %V8I16 = mul + %V8I16 = mul <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %V16I16 = mul + ; SSE42: cost of 2 {{.*}} %V16I16 = mul + ; AVX: cost of 4 {{.*}} %V16I16 = mul + ; AVX2: cost of 1 {{.*}} %V16I16 = mul + ; AVX512: cost of 1 {{.*}} %V16I16 = mul + %V16I16 = mul <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %V32I16 = mul + ; SSE42: cost of 4 {{.*}} %V32I16 = mul + ; AVX: cost of 8 {{.*}} %V32I16 = mul + ; AVX2: cost of 2 {{.*}} %V32I16 = mul + ; AVX512F: cost of 2 {{.*}} %V32I16 = mul + ; AVX512BW: cost of 1 {{.*}} %V32I16 = mul + %V32I16 = mul <32 x i16> undef, undef + + ; CHECK: cost of 1 
{{.*}} %I8 = mul + %I8 = mul i8 undef, undef + ; SSSE3: cost of 12 {{.*}} %V16I8 = mul + ; SSE42: cost of 12 {{.*}} %V16I8 = mul + ; AVX: cost of 12 {{.*}} %V16I8 = mul + ; AVX2: cost of 7 {{.*}} %V16I8 = mul + ; AVX512F: cost of 5 {{.*}} %V16I8 = mul + ; AVX512BW: cost of 4 {{.*}} %V16I8 = mul + %V16I8 = mul <16 x i8> undef, undef + ; SSSE3: cost of 24 {{.*}} %V32I8 = mul + ; SSE42: cost of 24 {{.*}} %V32I8 = mul + ; AVX: cost of 26 {{.*}} %V32I8 = mul + ; AVX2: cost of 17 {{.*}} %V32I8 = mul + ; AVX512F: cost of 13 {{.*}} %V32I8 = mul + ; AVX512BW: cost of 4 {{.*}} %V32I8 = mul + %V32I8 = mul <32 x i8> undef, undef + ; SSSE3: cost of 48 {{.*}} %V64I8 = mul + ; SSE42: cost of 48 {{.*}} %V64I8 = mul + ; AVX: cost of 52 {{.*}} %V64I8 = mul + ; AVX2: cost of 34 {{.*}} %V64I8 = mul + ; AVX512F: cost of 26 {{.*}} %V64I8 = mul + ; AVX512BW: cost of 11 {{.*}} %V64I8 = mul + %V64I8 = mul <64 x i8> undef, undef ret i32 undef } Index: test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll =================================================================== --- test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -1247,6 +1247,18 @@ ret float %res } +declare float @llvm.fma.f32(float, float, float) +define float @test_fma_intrin(float %a, float %b, float %c) { +; CHECK-LABEL: name: test_fma_intrin +; CHECK: [[A:%[0-9]+]](s32) = COPY %s0 +; CHECK: [[B:%[0-9]+]](s32) = COPY %s1 +; CHECK: [[C:%[0-9]+]](s32) = COPY %s2 +; CHECK: [[RES:%[0-9]+]](s32) = G_FMA [[A]], [[B]], [[C]] +; CHECK: %s0 = COPY [[RES]] + %res = call float @llvm.fma.f32(float %a, float %b, float %c) + ret float %res +} + declare void @llvm.lifetime.start.p0i8(i64, i8*) declare void @llvm.lifetime.end.p0i8(i64, i8*) define void @test_lifetime_intrin() { Index: test/CodeGen/AArch64/arm64-neon-copy.ll =================================================================== --- test/CodeGen/AArch64/arm64-neon-copy.ll +++ test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1378,7 +1378,7 @@ define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 { ; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64: -; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0] +; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %vecext = extractelement <2 x i64> %x, i32 0 %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0 Index: test/CodeGen/AArch64/swiftself-scavenger.ll =================================================================== --- test/CodeGen/AArch64/swiftself-scavenger.ll +++ test/CodeGen/AArch64/swiftself-scavenger.ll @@ -5,7 +5,7 @@ ; CHECK: str [[REG:x[0-9]+]], [sp, #8] ; CHECK: add [[REG]], sp, #248 ; CHECK: str xzr, [{{\s*}}[[REG]], #32760] -; CHECK: ldr x30, [sp, #8] +; CHECK: ldr [[REG]], [sp, #8] target triple = "arm64-apple-ios" @ptr8 = external global i8* Index: test/CodeGen/AArch64/xray-attribute-instrumentation.ll =================================================================== --- test/CodeGen/AArch64/xray-attribute-instrumentation.ll +++ test/CodeGen/AArch64/xray-attribute-instrumentation.ll @@ -25,9 +25,9 @@ ; CHECK-NEXT: ret } ; CHECK: .p2align 4 -; CHECK-NEXT: .xword .Lxray_synthetic_0 ; CHECK-NEXT: .xword .Lxray_fn_idx_synth_0 ; CHECK-NEXT: .section xray_instr_map,{{.*}} -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0 ; CHECK: .xword .Lxray_sled_0 ; CHECK: .xword .Lxray_sled_1 +; CHECK-LABEL: Lxray_sleds_end0 Index: test/CodeGen/AArch64/xray-tail-call-sled.ll 
=================================================================== --- test/CodeGen/AArch64/xray-tail-call-sled.ll +++ test/CodeGen/AArch64/xray-tail-call-sled.ll @@ -28,21 +28,20 @@ ; CHECK-NEXT: ret } ; CHECK: .p2align 4 -; CHECK-NEXT: .xword .Lxray_synthetic_0 ; CHECK-NEXT: .xword .Lxray_fn_idx_synth_0 ; CHECK-NEXT: .section xray_instr_map,{{.*}} -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .xword .Lxray_sled_0 ; CHECK: .xword .Lxray_sled_1 -; CHECK-LABEL: Lxray_synthetic_end0: +; CHECK-LABEL: Lxray_sleds_end0: ; CHECK: .section xray_fn_idx,{{.*}} ; CHECK-LABEL: Lxray_fn_idx_synth_0: -; CHECK: .xword .Lxray_synthetic_0 -; CHECK-NEXT: .xword .Lxray_synthetic_end0 +; CHECK: .xword .Lxray_sleds_start0 +; CHECK-NEXT: .xword .Lxray_sleds_end0 define i32 @caller() nounwind noinline uwtable "function-instrument"="xray-always" { ; CHECK: .p2align 2 -; CHECK-LABEL: .Lxray_sled_2: +; CHECK-LABEL: Lxray_sled_2: ; CHECK-NEXT: b #32 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -53,7 +52,7 @@ ; CHECK-NEXT: nop ; CHECK-LABEL: .Ltmp2: ; CHECK: .p2align 2 -; CHECK-LABEL: .Lxray_sled_3: +; CHECK-LABEL: Lxray_sled_3: ; CHECK-NEXT: b #32 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -68,14 +67,13 @@ ret i32 %retval } ; CHECK: .p2align 4 -; CHECK-NEXT: .xword .Lxray_synthetic_1 ; CHECK-NEXT: .xword .Lxray_fn_idx_synth_1 ; CHECK-NEXT: .section xray_instr_map,{{.*}} -; CHECK-LABEL: Lxray_synthetic_1: +; CHECK-LABEL: Lxray_sleds_start1: ; CHECK: .xword .Lxray_sled_2 ; CHECK: .xword .Lxray_sled_3 -; CHECK-LABEL: Lxray_synthetic_end1: +; CHECK-LABEL: Lxray_sleds_end1: ; CHECK: .section xray_fn_idx,{{.*}} ; CHECK-LABEL: Lxray_fn_idx_synth_1: -; CHECK: .xword .Lxray_synthetic_1 -; CHECK-NEXT: .xword .Lxray_synthetic_end1 +; CHECK: .xword .Lxray_sleds_start1 +; CHECK-NEXT: .xword .Lxray_sleds_end1 Index: test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir =================================================================== --- test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -68,6 +68,10 @@ ret void } + define amdgpu_kernel void @undefined_vreg_operand() { + unreachable + } + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } @@ -856,3 +860,26 @@ S_ENDPGM ... +--- +# There is only an undef use operand for %1, so there is no +# corresponding defining instruction + +# GCN-LABEL: name: undefined_vreg_operand{{$}} +# GCN: bb.0 +# GCN-NEXT: FLAT_STORE_DWORD undef %3, undef %1, +# GCN-NEXT: S_ENDPGM +name: undefined_vreg_operand +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } + - { id: 1, class: vgpr_32, preferred-register: '' } + - { id: 2, class: vgpr_32, preferred-register: '' } + - { id: 3, class: vreg_64, preferred-register: '' } +body: | + bb.0: + %0 = V_MOV_B32_e32 0, implicit %exec + %2 = V_XOR_B32_e64 killed %0, undef %1, implicit %exec + FLAT_STORE_DWORD undef %3, %2, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM + +... Index: test/CodeGen/AMDGPU/fold-operands-order.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fold-operands-order.mir @@ -0,0 +1,47 @@ +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @mov_in_use_list_2x() { + unreachable + } + +... +--- + +# Blocks should be processed in program order to make sure folds +# aren't made in users before the def is seen. 
+ +# GCN-LABEL: name: mov_in_use_list_2x{{$}} +# GCN: %2 = V_MOV_B32_e32 0, implicit %exec +# GCN-NEXT: %3 = COPY undef %0 + +# GCN: %1 = V_MOV_B32_e32 0, implicit %exec + + +name: mov_in_use_list_2x +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } + - { id: 1, class: vgpr_32, preferred-register: '' } + - { id: 2, class: vgpr_32, preferred-register: '' } + - { id: 3, class: vgpr_32, preferred-register: '' } +liveins: +body: | + bb.0: + successors: %bb.2 + + S_BRANCH %bb.2 + + bb.1: + successors: %bb.2 + + %2 = COPY %1 + %3 = V_XOR_B32_e64 killed %2, undef %0, implicit %exec + + bb.2: + successors: %bb.1 + + %1 = V_MOV_B32_e32 0, implicit %exec + S_BRANCH %bb.1 + +... Index: test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/frame-index-elimination.ll +++ test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -6,9 +6,9 @@ ; Materialize into a mov. Make sure there isn't an unnecessary copy. ; GCN-LABEL: {{^}}func_mov_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 vcc_hi, s5, s4 -; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], vcc_hi, 6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] +; GCN: s_sub_u32 s6, s5, s4 +; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 +; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_mov_fi_i32() #0 { @@ -22,9 +22,9 @@ ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 vcc_hi, s5, s4 -; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], vcc_hi, 6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] +; GCN: s_sub_u32 s6, s5, s4 +; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 +; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -39,9 +39,9 @@ ; into. 
; GCN-LABEL: {{^}}func_other_fi_user_i32: -; GCN: s_sub_u32 vcc_hi, s5, s4 -; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], vcc_hi, 6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] +; GCN: s_sub_u32 s6, s5, s4 +; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 +; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] ; GCN-NEXT: v_mul_lo_i32 v0, v0, 9 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -71,8 +71,8 @@ ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: s_sub_u32 vcc_hi, s5, s4 -; GCN-NEXT: v_lshr_b32_e64 v0, vcc_hi, 6 +; GCN-NEXT: s_sub_u32 s6, s5, s4 +; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -99,8 +99,8 @@ } ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: -; GCN: s_sub_u32 vcc_hi, s5, s4 -; GCN: v_lshr_b32_e64 v1, vcc_hi, 6 +; GCN: s_sub_u32 s6, s5, s4 +; GCN: v_lshr_b32_e64 v1, s6, 6 ; GCN: s_and_saveexec_b64 ; GCN: v_add_i32_e32 v0, vcc, 4, v1 @@ -123,10 +123,10 @@ ; Added offset can't be used with VOP3 add ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32: -; GCN: s_sub_u32 vcc_hi, s5, s4 -; GCN-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], vcc_hi, 6 -; GCN-DAG: s_movk_i32 vcc_hi, 0x204 -; GCN: v_add_i32_e32 v0, vcc, vcc_hi, [[SCALED]] +; GCN: s_sub_u32 s6, s5, s4 +; GCN-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 +; GCN-DAG: s_movk_i32 s6, 0x204 +; GCN: v_add_i32_e64 v0, s[6:7], s6, [[SCALED]] ; GCN: v_mul_lo_i32 v0, v0, 9 ; GCN: ds_write_b32 v0, v0 define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { Index: test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll +++ test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll @@ -2,7 +2,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s ;CHECK-LABEL: {{^}}test1: -;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 glc slc define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, @@ -11,8 +11,38 @@ ret void } +;CHECK-LABEL: {{^}}test1_idx: +;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:32 glc slc +define amdgpu_vs void @test1_idx(i32 %a1, i32 %vaddr) { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test1_scalar_offset: +;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, {{s[0-9]+}} idxen offset:32 glc slc +define amdgpu_vs void @test1_scalar_offset(i32 %a1, i32 %vaddr, i32 inreg %soffset) { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 4, i32 %vaddr, i32 %soffset, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test1_no_glc_slc: +;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 
offen offset:32 +define amdgpu_vs void @test1_no_glc_slc(i32 %a1, i32 %vaddr) { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 0, + i32 0, i32 0) + ret void +} + ;CHECK-LABEL: {{^}}test2: -;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 offen offset:24 glc slc define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, @@ -22,7 +52,7 @@ } ;CHECK-LABEL: {{^}}test3: -;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:11, nfmt:4, 0 offen offset:16 glc slc define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) { %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, @@ -32,7 +62,7 @@ } ;CHECK-LABEL: {{^}}test4: -;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +;CHECK: tbuffer_store_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:4, nfmt:4, 0 offen offset:8 glc slc define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) { call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, Index: test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll @@ -4,7 +4,7 @@ declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8: -; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] ; GCN-DAG: v_mov_b32_e32 v5, v1 ; GCN-DAG: v_mov_b32_e32 v4, v0 define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { Index: test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll @@ -4,7 +4,7 @@ declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8: -; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] ; GCN-DAG: v_mov_b32_e32 v5, v1 ; GCN-DAG: v_mov_b32_e32 v4, v0 define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll @@ -0,0 +1,109 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_load: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, 
{{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 3, i1 1, i1 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 1) + %vdata_f32 = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3 + ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3 +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs_large +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1 +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 73, i32 14, i32 3, i1 0, i1 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 1, i32 13, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + ret {<4 x 
float>, <4 x float>, <4 x float>} %r2 +} + +; GCN-LABEL: {{^}}tbuffer_load_idx: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen +define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen +define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52 +define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_both: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen +define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + + +; GCN-LABEL: {{^}}buffer_load_xy: +; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { + %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) + %vdata.f = bitcast <2 x i32> %vdata to <2 x float> + ret <2 x float> %vdata.f +} + +; GCN-LABEL: {{^}}buffer_load_x: +; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { + %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) + %vdata.f = bitcast i32 %vdata to float + ret float %vdata.f +} + +declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll @@ -0,0 +1,110 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck 
-check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_store: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0 +; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc +; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc +; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 +define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + %in2 = bitcast <4 x float> %2 to <4 x i32> + %in3 = bitcast <4 x float> %3 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 3, i1 1, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 1) + call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_immoffs: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42 +define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42 +define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 %soffset, i32 42, i32 5, i32 7, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_idx: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 15, i32 2, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_ofs: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 0, i32 3, i32 7, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_both: +; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 6, i32 4, i1 0, i1 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +; GCN-LABEL: {{^}}buffer_store_wait: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen +; GCN: s_waitcnt expcnt(0) +; GCN: 
buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +; GCN: s_waitcnt vmcnt(0) +; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:16, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0) + %data.i = bitcast <4 x float> %data to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 16, i32 2, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x1: +; GCN: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +main_body: + %data.i = bitcast float %data to i32 + call void @llvm.amdgcn.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 13, i32 7, i1 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x2: +; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) { +main_body: + %data.i = bitcast <2 x float> %data to <2 x i32> + call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } + Index: test/CodeGen/AMDGPU/merge-store-crash.ll =================================================================== --- test/CodeGen/AMDGPU/merge-store-crash.ll +++ test/CodeGen/AMDGPU/merge-store-crash.ll @@ -26,11 +26,11 @@ %tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp7, i32 1 %tmp10 = insertelement <4 x i32> %tmp9, i32 undef, i32 2 %tmp11 = insertelement <4 x i32> %tmp10, i32 undef, i32 3 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %tmp11, i32 4, i32 undef, i32 %arg, i32 0, i32 14, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 undef, i32 0, i32 %arg, i32 0, i32 14, i32 4, i1 1, i1 1) ret void } ; Function Attrs: nounwind -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/merge-store-usedef.ll =================================================================== --- test/CodeGen/AMDGPU/merge-store-usedef.ll +++ test/CodeGen/AMDGPU/merge-store-usedef.ll @@ -11,13 +11,13 @@ store i32 %v, i32 addrspace(3)* %p0 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, 
i32 %v, i32 1, i32 undef, i32 undef, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i1 1, i1 0) %w = load i32, i32 addrspace(3)* %p0 store i32 %w, i32 addrspace(3)* %p1 ret void } -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/mubuf.ll =================================================================== --- test/CodeGen/AMDGPU/mubuf.ll +++ test/CodeGen/AMDGPU/mubuf.ll @@ -62,7 +62,8 @@ %tmp2 = shl i32 %6, 2 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) ret void } @@ -80,7 +81,8 @@ %tmp2 = shl i32 %6, 2 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) ret void } @@ -175,6 +177,6 @@ } declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) attributes #0 = { nounwind readonly } Index: test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir @@ -0,0 +1,341 @@ +# RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies,si-fold-operands,dead-mi-elimination -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +# Check that constant is in SGPR registers + +# GCN-LABEL: {{^}}name: const_to_sgpr{{$}} +# GCN: %[[HI:[0-9]+]] = S_MOV_B32 0 +# GCN-NEXT: %[[LO:[0-9]+]] = S_MOV_B32 1048576 +# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]] = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2 +# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec + + +# GCN-LABEL: {{^}}name: const_to_sgpr_multiple_use{{$}} +# GCN: %[[HI:[0-9]+]] = S_MOV_B32 0 +# GCN-NEXT: %[[LO:[0-9]+]] = S_MOV_B32 1048576 +# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]] = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2 +# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec +# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec + +# GCN-LABEL: {{^}}name: const_to_sgpr_subreg{{$}} +# GCN: %[[OP0:[0-9]+]] = REG_SEQUENCE killed %{{[0-9]+}}, 1, killed %{{[0-9]+}}, 2 +# GCN-NEXT: V_CMP_LT_U32_e64 killed %[[OP0]].sub0, 12, implicit %exec + +--- | + define amdgpu_kernel void @const_to_sgpr(i32 addrspace(1)* 
nocapture %arg, i64 %id) { + bb: + br i1 undef, label %bb1, label %bb2 + + bb1: ; preds = %bb + br label %bb2 + + bb2: ; preds = %bb1, %bb + ret void + } + + define amdgpu_kernel void @const_to_sgpr_multiple_use(i32 addrspace(1)* nocapture %arg, i64 %id1, i64 %id2) { + bb: + br i1 undef, label %bb1, label %bb2 + + bb1: ; preds = %bb + br label %bb2 + + bb2: ; preds = %bb1, %bb + ret void + } + + define amdgpu_kernel void @const_to_sgpr_subreg(i32 addrspace(1)* nocapture %arg, i64 %id) { + bb: + br i1 undef, label %bb1, label %bb2 + + bb1: ; preds = %bb + br label %bb2 + + bb2: ; preds = %bb1, %bb + ret void + } + +... +--- +name: const_to_sgpr +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_64 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sgpr_64 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_64_xexec } + - { id: 8, class: sreg_64_xexec } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_64 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sreg_32_xm0 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_32_xm0 } + - { id: 17, class: sreg_64 } + - { id: 18, class: sreg_32_xm0 } + - { id: 19, class: sreg_32_xm0 } + - { id: 20, class: sreg_64 } + - { id: 21, class: sreg_64 } + - { id: 22, class: vreg_64 } + - { id: 23, class: sreg_32_xm0 } + - { id: 24, class: sreg_64 } + - { id: 25, class: sreg_32_xm0 } + - { id: 26, class: sreg_32_xm0 } + - { id: 27, class: sgpr_64 } + - { id: 28, class: sgpr_128 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vreg_64 } +liveins: + - { reg: '%vgpr0', virtual-reg: '%2' } + - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' } +body: | + bb.0.bb: + successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) + liveins: %vgpr0, %sgpr0_sgpr1 + + %3 = COPY %sgpr0_sgpr1 + %2 = COPY %vgpr0 + %7 = S_LOAD_DWORDX2_IMM %3, 9, 0 + %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 + %6 = COPY %7 + %9 = S_MOV_B32 0 + %10 = REG_SEQUENCE %2, 1, killed %9, 2 + %0 = COPY %10 + %11 = COPY %10.sub0 + %12 = COPY %10.sub1 + %13 = COPY %8.sub0 + %14 = COPY %8.sub1 + %15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc + %16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc + %17 = REG_SEQUENCE killed %15, 1, killed %16, 2 + %18 = S_MOV_B32 0 + %19 = S_MOV_B32 1048576 + %20 = REG_SEQUENCE killed %19, 1, killed %18, 2 + %22 = COPY killed %20 + %21 = V_CMP_LT_U64_e64 killed %17, %22, implicit %exec + %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_BRANCH %bb.1.bb1 + + bb.1.bb1: + successors: %bb.2.bb2(0x80000000) + + %23 = S_MOV_B32 2 + %24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc + %25 = S_MOV_B32 61440 + %26 = S_MOV_B32 0 + %27 = REG_SEQUENCE killed %26, 1, killed %25, 2 + %28 = REG_SEQUENCE %6, 17, killed %27, 18 + %29 = V_MOV_B32_e32 0, implicit %exec + %30 = COPY %24 + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, implicit %exec + + bb.2.bb2: + SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_ENDPGM + +... 
+--- +name: const_to_sgpr_multiple_use +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_64 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sgpr_64 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_64_xexec } + - { id: 8, class: sreg_64_xexec } + - { id: 9, class: sreg_64_xexec } + - { id: 10, class: sreg_32 } + - { id: 11, class: sreg_64 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sreg_32_xm0 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_32_xm0 } + - { id: 17, class: sreg_32_xm0 } + - { id: 18, class: sreg_64 } + - { id: 19, class: sreg_32_xm0 } + - { id: 20, class: sreg_32_xm0 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32_xm0 } + - { id: 23, class: sreg_64 } + - { id: 24, class: sreg_32_xm0 } + - { id: 25, class: sreg_32_xm0 } + - { id: 26, class: sreg_64 } + - { id: 27, class: sreg_64 } + - { id: 28, class: vreg_64 } + - { id: 29, class: sreg_64 } + - { id: 30, class: vreg_64 } + - { id: 31, class: sreg_64 } + - { id: 32, class: sreg_32_xm0 } + - { id: 33, class: sreg_64 } + - { id: 34, class: sreg_32_xm0 } + - { id: 35, class: sreg_32_xm0 } + - { id: 36, class: sgpr_64 } + - { id: 37, class: sgpr_128 } + - { id: 38, class: vgpr_32 } + - { id: 39, class: vreg_64 } +liveins: + - { reg: '%vgpr0', virtual-reg: '%2' } + - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' } +body: | + bb.0.bb: + successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) + liveins: %vgpr0, %sgpr0_sgpr1 + + %3 = COPY %sgpr0_sgpr1 + %2 = COPY %vgpr0 + %7 = S_LOAD_DWORDX2_IMM %3, 9, 0 + %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 + %9 = S_LOAD_DWORDX2_IMM %3, 13, 0 + %6 = COPY %7 + %10 = S_MOV_B32 0 + %11 = REG_SEQUENCE %2, 1, killed %10, 2 + %0 = COPY %11 + %12 = COPY %11.sub0 + %13 = COPY %11.sub1 + %14 = COPY %8.sub0 + %15 = COPY %8.sub1 + %16 = S_ADD_U32 %12, killed %14, implicit-def %scc + %17 = S_ADDC_U32 %13, killed %15, implicit-def dead %scc, implicit %scc + %18 = REG_SEQUENCE killed %16, 1, killed %17, 2 + %19 = COPY %9.sub0 + %20 = COPY %9.sub1 + %21 = S_ADD_U32 %12, killed %19, implicit-def %scc + %22 = S_ADDC_U32 %13, killed %20, implicit-def dead %scc, implicit %scc + %23 = REG_SEQUENCE killed %21, 1, killed %22, 2 + %24 = S_MOV_B32 0 + %25 = S_MOV_B32 1048576 + %26 = REG_SEQUENCE killed %25, 1, killed %24, 2 + %28 = COPY %26 + %27 = V_CMP_LT_U64_e64 killed %18, %28, implicit %exec + %29 = V_CMP_LT_U64_e64 killed %23, %28, implicit %exec + %31 = S_AND_B64 killed %27, killed %29, implicit-def dead %scc + %1 = SI_IF killed %31, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_BRANCH %bb.1.bb1 + + bb.1.bb1: + successors: %bb.2.bb2(0x80000000) + + %32 = S_MOV_B32 2 + %33 = S_LSHL_B64 %0, killed %32, implicit-def dead %scc + %34 = S_MOV_B32 61440 + %35 = S_MOV_B32 0 + %36 = REG_SEQUENCE killed %35, 1, killed %34, 2 + %37 = REG_SEQUENCE %6, 17, killed %36, 18 + %38 = V_MOV_B32_e32 0, implicit %exec + %39 = COPY %33 + BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, implicit %exec + + bb.2.bb2: + SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_ENDPGM + +... 
+--- +name: const_to_sgpr_subreg +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_64 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sgpr_64 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_64_xexec } + - { id: 8, class: sreg_64_xexec } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_64 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sreg_32_xm0 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_32_xm0 } + - { id: 17, class: sreg_64 } + - { id: 18, class: sreg_32_xm0 } + - { id: 19, class: sreg_32_xm0 } + - { id: 20, class: sreg_64 } + - { id: 21, class: sreg_64 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: sreg_32_xm0 } + - { id: 24, class: sreg_64 } + - { id: 25, class: sreg_32_xm0 } + - { id: 26, class: sreg_32_xm0 } + - { id: 27, class: sgpr_64 } + - { id: 28, class: sgpr_128 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vreg_64 } +liveins: + - { reg: '%vgpr0', virtual-reg: '%2' } + - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' } +body: | + bb.0.bb: + successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) + liveins: %vgpr0, %sgpr0_sgpr1 + + %3 = COPY %sgpr0_sgpr1 + %2 = COPY %vgpr0 + %7 = S_LOAD_DWORDX2_IMM %3, 9, 0 + %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 + %6 = COPY %7 + %9 = S_MOV_B32 0 + %10 = REG_SEQUENCE %2, 1, killed %9, 2 + %0 = COPY %10 + %11 = COPY %10.sub0 + %12 = COPY %10.sub1 + %13 = COPY %8.sub0 + %14 = COPY %8.sub1 + %15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc + %16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc + %17 = REG_SEQUENCE killed %15, 1, killed %16, 2 + %18 = S_MOV_B32 12 + %19 = S_MOV_B32 1048576 + %20 = REG_SEQUENCE killed %19, 1, killed %18, 2 + %22 = COPY killed %20.sub1 + %21 = V_CMP_LT_U32_e64 killed %17.sub0, %22, implicit %exec + %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_BRANCH %bb.1.bb1 + + bb.1.bb1: + successors: %bb.2.bb2(0x80000000) + + %23 = S_MOV_B32 2 + %24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc + %25 = S_MOV_B32 61440 + %26 = S_MOV_B32 0 + %27 = REG_SEQUENCE killed %26, 1, killed %25, 2 + %28 = REG_SEQUENCE %6, 17, killed %27, 18 + %29 = V_MOV_B32_e32 0, implicit %exec + %30 = COPY %24 + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, implicit %exec + + bb.2.bb2: + SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_ENDPGM + +... 
Index: test/CodeGen/AMDGPU/scheduler-subrange-crash.ll =================================================================== --- test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -25,29 +25,29 @@ %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp3, i32 1, i32 36, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1) %bc = bitcast <4 x float> %array_vector3 to <4 x i32> %tmp4 = extractelement <4 x i32> %bc, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp4, i32 1, i32 48, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 48, i32 4, i32 4, i1 1, i1 1) %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32> %tmp5 = extractelement <4 x i32> %bc49, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp5, i32 1, i32 72, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 72, i32 4, i32 4, i1 1, i1 1) %array_vector21 = insertelement <4 x float> , float %tmp, i32 1 %array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2 %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 28, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 28, i32 4, i32 4, i1 1, i1 1) %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32> %tmp6 = extractelement <4 x i32> %bc52, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp6, i32 1, i32 64, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 20, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 56, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 92, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 64, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 20, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 56, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 92, i32 4, i32 4, i1 1, i1 1) ret void } declare float @llvm.SI.load.const(<16 x i8>, i32) #1 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #2 -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32) #3 +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3 attributes #0 = { nounwind "target-cpu"="tonga" } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- test/CodeGen/AMDGPU/sdwa-peephole.ll +++ test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -400,9 +400,9 @@ ; Check that "pulling out" SDWA operands works correctly. ; GCN-LABEL: {{^}}pulled_out_test: -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_and_b32_sdwa Index: test/CodeGen/AMDGPU/shrink-carry.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/shrink-carry.mir @@ -0,0 +1,101 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-insert-skips -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: subbrev{{$}} +# GCN: V_SUBBREV_U32_e64 0, undef %vgpr0, killed %vcc, implicit %exec + +--- +name: subbrev +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_SUBBREV_U32_e64 0, %0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: subb{{$}} +# GCN: V_SUBB_U32_e64 undef %vgpr0, 0, killed %vcc, implicit %exec + +--- +name: subb +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_SUBB_U32_e64 %0, 0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: addc{{$}} +# GCN: V_ADDC_U32_e32 0, undef %vgpr0, implicit-def %vcc, implicit killed %vcc, implicit %exec + +--- +name: addc +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_ADDC_U32_e64 0, %0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: addc2{{$}} +# GCN: V_ADDC_U32_e32 0, undef %vgpr0, implicit-def %vcc, implicit killed %vcc, implicit %exec + +--- +name: addc2 +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_ADDC_U32_e64 %0, 0, %3, implicit %exec + S_ENDPGM + +... 
Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) declare void @llvm.amdgcn.s.barrier() #1 declare i32 @llvm.amdgcn.workitem.id.x() #2 @@ -258,9 +258,8 @@ ; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 ; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 -; call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, -; i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, -; i32 1, i32 0) +; call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef, +; i32 %vaddr, i32 0, i32 0, i32 32, i32 14, i32 4, i1 1, i1 1) ; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 Index: test/CodeGen/AMDGPU/sint_to_fp.i64.ll =================================================================== --- test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -22,7 +22,7 @@ ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], @@ -57,7 +57,7 @@ ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], Index: test/CodeGen/AMDGPU/spill-m0.ll =================================================================== --- test/CodeGen/AMDGPU/spill-m0.ll +++ test/CodeGen/AMDGPU/spill-m0.ll @@ -119,10 +119,10 @@ ; GCN: ; clobber m0 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 s2, m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_mov_b32 m0, s2 ; TOSMEM: s_mov_b64 exec, ; TOSMEM: s_cbranch_execz @@ -170,10 +170,10 @@ ; TOSMEM: s_mov_b32 m0, -1 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 s0, m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_mov_b32 m0, s0 ; TOSMEM: s_waitcnt lgkmcnt(0) ; TOSMEM: ds_write_b64 Index: test/CodeGen/AMDGPU/uint_to_fp.i64.ll =================================================================== --- test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -19,7 +19,7 @@ ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]] ; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]] @@ -50,7 +50,7 @@ ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]] ; GCN: {{buffer|flat}}_store_dword 
{{.*}}[[VR]] Index: test/CodeGen/ARM/v6m-umul-with-overflow.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/v6m-umul-with-overflow.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=thumbv6m-none-eabi | FileCheck %s + +define i1 @unsigned_multiplication_did_overflow(i32, i32) { +; CHECK-LABEL: unsigned_multiplication_did_overflow: +entry-block: + %2 = tail call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %0, i32 %1) + %3 = extractvalue { i32, i1 } %2, 1 + ret i1 %3 + +; CHECK: mov{{s?}} r2, r1 +; CHECK: mov{{s?}} r1, #0 +; CHECK: mov{{s?}} r3, {{#0|r1}} +; CHECK: bl __aeabi_lmul +} + +declare { i32, i1 } @llvm.umul.with.overflow.i32(i32, i32) Index: test/CodeGen/ARM/xray-armv6-attribute-instrumentation.ll =================================================================== --- test/CodeGen/ARM/xray-armv6-attribute-instrumentation.ll +++ test/CodeGen/ARM/xray-armv6-attribute-instrumentation.ll @@ -24,14 +24,13 @@ ; CHECK-NEXT: bx lr } ; CHECK: .p2align 4 -; CHECK-NEXT: .long {{.*}}Lxray_synthetic_0 ; CHECK-NEXT: .long {{.*}}Lxray_fn_idx_synth_0 ; CHECK-NEXT: .section {{.*}}xray_instr_map{{.*}} -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .long {{.*}}Lxray_sled_0 ; CHECK: .long {{.*}}Lxray_sled_1 -; CHECK-LABEL: Lxray_synthetic_end0: +; CHECK-LABEL: Lxray_sleds_end0: ; CHECK: .section {{.*}}xray_fn_idx{{.*}} ; CHECK-LABEL: Lxray_fn_idx_synth_0: -; CHECK: .long {{.*}}Lxray_synthetic_0 -; CHECK-NEXT: .long {{.*}}Lxray_synthetic_end0 +; CHECK: .long {{.*}}Lxray_sleds_start0 +; CHECK-NEXT: .long {{.*}}Lxray_sleds_end0 Index: test/CodeGen/ARM/xray-armv7-attribute-instrumentation.ll =================================================================== --- test/CodeGen/ARM/xray-armv7-attribute-instrumentation.ll +++ test/CodeGen/ARM/xray-armv7-attribute-instrumentation.ll @@ -24,15 +24,14 @@ ; CHECK-NEXT: bx lr } ; CHECK: .p2align 4 -; CHECK-NEXT: .long {{.*}}Lxray_synthetic_0 ; CHECK-NEXT: .long {{.*}}Lxray_fn_idx_synth_0 ; CHECK-NEXT: .section {{.*}}xray_instr_map{{.*}} -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .long {{.*}}Lxray_sled_0 ; CHECK: .long {{.*}}Lxray_sled_1 -; CHECK-LABEL: Lxray_synthetic_end0: +; CHECK-LABEL: Lxray_sleds_end0: ; CHECK: .section {{.*}}xray_fn_idx{{.*}} ; CHECK-LABEL: Lxray_fn_idx_synth_0: -; CHECK: .long {{.*}}xray_synthetic_0 -; CHECK-NEXT: .long {{.*}}xray_synthetic_end0 +; CHECK: .long {{.*}}xray_sleds_start0 +; CHECK-NEXT: .long {{.*}}xray_sleds_end0 Index: test/CodeGen/PowerPC/2010-02-12-saveCR.ll =================================================================== --- test/CodeGen/PowerPC/2010-02-12-saveCR.ll +++ test/CodeGen/PowerPC/2010-02-12-saveCR.ll @@ -8,15 +8,15 @@ ; Note that part of what is being checked here is proper register reuse. 
; CHECK: mfcr [[T1:r[0-9]+]] ; cr2 ; CHECK: lis [[T2:r[0-9]+]], 1 -; CHECK: addi r3, r1, 72 ; CHECK: rotlwi [[T1]], [[T1]], 8 ; CHECK: ori [[T2]], [[T2]], 34540 ; CHECK: stwx [[T1]], r1, [[T2]] -; CHECK: lis [[T3:r[0-9]+]], 1 ; CHECK: mfcr [[T4:r[0-9]+]] ; cr3 -; CHECK: ori [[T3]], [[T3]], 34536 +; CHECK: lis [[T3:r[0-9]+]], 1 ; CHECK: rotlwi [[T4]], [[T4]], 12 +; CHECK: ori [[T3]], [[T3]], 34536 ; CHECK: stwx [[T4]], r1, [[T3]] +; CHECK: addi r3, r1, 72 %x = alloca [100000 x i8] ; <[100000 x i8]*> [#uses=1] %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] %x1 = bitcast [100000 x i8]* %x to i8* ; [#uses=1] Index: test/CodeGen/PowerPC/vsx-spill.ll =================================================================== --- test/CodeGen/PowerPC/vsx-spill.ll +++ test/CodeGen/PowerPC/vsx-spill.ll @@ -23,9 +23,9 @@ ; CHECK-REG: blr ; CHECK-FISL: @foo1 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65384 -; CHECK-FISL: stxsdx 1, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65384 +; CHECK-FISL: stxsdx 1, 1, 3 ; CHECK-FISL: blr ; CHECK-P9-REG: @foo1 @@ -54,8 +54,8 @@ ; CHECK-FISL: @foo2 ; CHECK-FISL: xsadddp [[R1:[0-9]+]], 1, 1 -; CHECK-FISL: stxsdx [[R1]], [[R1]], 0 -; CHECK-FISL: lxsdx [[R1]], [[R1]], 0 +; CHECK-FISL: stxsdx [[R1]], [[R1]], 3 +; CHECK-FISL: lxsdx [[R1]], [[R1]], 3 ; CHECK-FISL: blr ; CHECK-P9-REG: @foo2 Index: test/CodeGen/PowerPC/vsx.ll =================================================================== --- test/CodeGen/PowerPC/vsx.ll +++ test/CodeGen/PowerPC/vsx.ll @@ -235,9 +235,9 @@ ; CHECK-FISL-LABEL: @test14 ; CHECK-FISL: xxlor 0, 34, 35 ; CHECK-FISL: xxlnor 34, 34, 35 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65520 -; CHECK-FISL: stxvd2x 0, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65520 +; CHECK-FISL: stxvd2x 0, 1, 3 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test14 @@ -260,9 +260,9 @@ ; CHECK-FISL: xxlor 36, 0, 0 ; CHECK-FISL: xxlnor 0, 34, 35 ; CHECK-FISL: xxlor 34, 0, 0 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65520 -; CHECK-FISL: stxvd2x 36, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65520 +; CHECK-FISL: stxvd2x 36, 1, 3 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test15 @@ -285,9 +285,9 @@ ; CHECK-FISL: xxlor 36, 0, 0 ; CHECK-FISL: xxlnor 0, 34, 35 ; CHECK-FISL: xxlor 34, 0, 0 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65520 -; CHECK-FISL: stxvd2x 36, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65520 +; CHECK-FISL: stxvd2x 36, 1, 3 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test16 @@ -330,9 +330,9 @@ ; CHECK-FISL: xxlor 36, 0, 0 ; CHECK-FISL: xxlandc 0, 34, 35 ; CHECK-FISL: xxlor 34, 0, 0 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65520 -; CHECK-FISL: stxvd2x 36, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65520 +; CHECK-FISL: stxvd2x 36, 1, 3 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test18 @@ -355,9 +355,9 @@ ; CHECK-FISL: xxlor 36, 0, 0 ; CHECK-FISL: xxlandc 0, 34, 35 ; CHECK-FISL: xxlor 34, 0, 0 -; CHECK-FISL: lis 0, -1 -; CHECK-FISL: ori 0, 0, 65520 -; CHECK-FISL: stxvd2x 36, 1, 0 +; CHECK-FISL: lis 3, -1 +; CHECK-FISL: ori 3, 3, 65520 +; CHECK-FISL: stxvd2x 36, 1, 3 ; CHECK-FISL: blr ; CHECK-LE-LABEL: @test19 Index: test/CodeGen/X86/clear_upper_vector_element_bits.ll =================================================================== --- test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -714,22 +714,13 @@ ; ; AVX1-LABEL: _clearupper8xi32b: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: _clearupper8xi32b: ; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: retq %x16 = bitcast <8 x i32> %0 to <16 x i16> %r0 = insertelement <16 x i16> %x16, i16 zeroinitializer, i32 1 Index: test/CodeGen/X86/insertelement-zero.ll =================================================================== --- test/CodeGen/X86/insertelement-zero.ll +++ test/CodeGen/X86/insertelement-zero.ll @@ -405,25 +405,10 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] ; SSE41-NEXT: retq ; -; AVX1-LABEL: insert_v16i16_z12345z789ABCDEz: -; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_v16i16_z12345z789ABCDEz: -; AVX2: # BB#0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-NEXT: retq +; AVX-LABEL: insert_v16i16_z12345z789ABCDEz: +; AVX: # BB#0: +; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq %1 = insertelement <16 x i16> %a, i16 0, i32 0 %2 = insertelement <16 x i16> %1, i16 0, i32 6 %3 = insertelement <16 x i16> %2, i16 0, i32 15 Index: test/CodeGen/X86/memcmp.ll =================================================================== --- test/CodeGen/X86/memcmp.ll +++ test/CodeGen/X86/memcmp.ll @@ -12,19 +12,46 @@ define i32 @length2(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length2: -; X32: # BB#0: -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $2 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll memcmp -; X32-NEXT: addl $16, %esp +; X32: # BB#0: # %loadbb +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movzwl (%ecx), %ecx +; X32-NEXT: movzwl (%eax), %eax +; X32-NEXT: rolw $8, %cx +; X32-NEXT: rolw $8, %ax +; X32-NEXT: movzwl %cx, %ecx +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: cmpl %eax, %ecx +; X32-NEXT: je .LBB0_1 +; X32-NEXT: # BB#2: # %res_block +; X32-NEXT: movl $-1, %eax +; X32-NEXT: jb .LBB0_4 +; X32-NEXT: # BB#3: # %res_block +; X32-NEXT: movl $1, %eax 
+; X32-NEXT: .LBB0_4: # %endblock +; X32-NEXT: retl +; X32-NEXT: .LBB0_1: +; X32-NEXT: xorl %eax, %eax ; X32-NEXT: retl ; ; X64-LABEL: length2: -; X64: # BB#0: -; X64-NEXT: movl $2, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzwl (%rsi), %ecx +; X64-NEXT: rolw $8, %ax +; X64-NEXT: rolw $8, %cx +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %cx, %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB0_1 +; X64-NEXT: # BB#2: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB0_1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind ret i32 %m } @@ -145,19 +172,42 @@ define i32 @length4(i8* %X, i8* %Y) nounwind { ; X32-LABEL: length4: -; X32: # BB#0: -; X32-NEXT: pushl $0 -; X32-NEXT: pushl $4 -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: pushl {{[0-9]+}}(%esp) -; X32-NEXT: calll memcmp -; X32-NEXT: addl $16, %esp +; X32: # BB#0: # %loadbb +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ecx +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: bswapl %ecx +; X32-NEXT: bswapl %eax +; X32-NEXT: cmpl %eax, %ecx +; X32-NEXT: je .LBB6_1 +; X32-NEXT: # BB#2: # %res_block +; X32-NEXT: movl $-1, %eax +; X32-NEXT: jb .LBB6_4 +; X32-NEXT: # BB#3: # %res_block +; X32-NEXT: movl $1, %eax +; X32-NEXT: .LBB6_4: # %endblock +; X32-NEXT: retl +; X32-NEXT: .LBB6_1: +; X32-NEXT: xorl %eax, %eax ; X32-NEXT: retl ; ; X64-LABEL: length4: -; X64: # BB#0: -; X64-NEXT: movl $4, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax +; X64-NEXT: bswapl %ecx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB6_1 +; X64-NEXT: # BB#2: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB6_1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind ret i32 %m } @@ -259,9 +309,21 @@ ; X32-NEXT: retl ; ; X64-LABEL: length8: -; X64: # BB#0: -; X64-NEXT: movl $8, %edx -; X64-NEXT: jmp memcmp # TAILCALL +; X64: # BB#0: # %loadbb +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax +; X64-NEXT: bswapq %rcx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB11_1 +; X64-NEXT: # BB#2: # %res_block +; X64-NEXT: movl $-1, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: retq +; X64-NEXT: .LBB11_1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind ret i32 %m } Index: test/CodeGen/X86/non-value-mem-operand.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/non-value-mem-operand.mir @@ -0,0 +1,293 @@ +# RUN: llc -run-pass implicit-null-checks -mtriple=x86_64-apple-macosx -o - %s | FileCheck %s + +# CHECK-NOT: FAULTING_OP + +--- | + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + @global = external global i8* + @global.1 = external global i8* + + declare i8* @ham(i8*, i8**) + + define void @eggs(i8* %arg) gc "statepoint-example" { + bb: + %tmp = call i8* undef(i8* undef, i8** undef) + %tmp1 = icmp eq i8* %tmp, null + br i1 %tmp1, label %bb2, label %bb3, !make.implicit !0 + + bb2: ; preds = %bb + br i1 undef, label %bb51, label 
%bb59 + + bb3: ; preds = %bb + %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 16 + %tmp5 = bitcast i8* %tmp4 to i64* + br label %bb7 + + bb7: ; preds = %bb37, %bb3 + %tmp8 = phi i64* [ %tmp5, %bb3 ], [ %tmp18, %bb37 ] + %tmp10 = phi i32 [ undef, %bb3 ], [ %tmp48, %bb37 ] + %tmp12 = phi i32 [ 0, %bb3 ], [ 6, %bb37 ] + %tmp13 = phi double [ 0.000000e+00, %bb3 ], [ 2.000000e+00, %bb37 ] + %tmp14 = zext i32 %tmp10 to i64 + br i1 undef, label %bb26, label %bb15 + + bb15: ; preds = %bb7 + %tmp16 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 2882400000, i32 0, void ()* nonnull @wibble, i32 0, i32 0, i32 0, i32 30, i32 1, i32 0, i32 99, i32 0, i32 12, i32 0, i32 10, i32 %tmp10, i32 10, i32 0, i32 10, i32 %tmp12, i32 10, i32 undef, i32 6, float undef, i32 7, double %tmp13, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* %tmp, i32 7, double undef, i32 99, i8* null, i8* undef) + br label %bb26 + + bb26: ; preds = %bb15, %bb7 + %tmp18 = phi i64* [ %tmp8, %bb7 ], [ undef, %bb15 ] + %tmp20 = sub i32 0, 0 + %tmp21 = select i1 undef, i32 0, i32 %tmp20 + %tmp22 = sext i32 %tmp21 to i64 + %tmp23 = load i8*, i8** @global.1, align 8 + %tmp24 = icmp eq i8* %tmp23, null + %tmp25 = select i1 %tmp24, i8* null, i8* undef + %tmp27 = load i32, i32* undef, align 4 + %sunkaddr = mul i64 %tmp14, 8 + %tmp2 = bitcast i64* %tmp18 to i8* + %sunkaddr1 = getelementptr i8, i8* %tmp2, i64 %sunkaddr + %tmp3 = bitcast i8* %sunkaddr1 to i64* + %tmp28 = load i64, i64* %tmp3, align 8 + %tmp29 = add i64 %tmp28, 1 + store i64 %tmp29, i64* %tmp3, align 8 + %tmp30 = trunc i64 %tmp28 to i32 + %tmp31 = sub i32 %tmp27, %tmp30 + store i32 %tmp31, i32* undef, align 4 + %tmp32 = getelementptr inbounds i8, i8* %tmp25, i64 768 + %tmp33 = bitcast i8* %tmp32 to i64* + %tmp34 = load i64, i64* %tmp33, align 8 + br i1 undef, label %bb37, label %bb35 + + bb35: ; preds = %bb26 + %tmp36 = call i8* @ham(i8* undef, i8** nonnull @global) + br label %bb37 + + bb37: ; preds = %bb35, %bb26 + %tmp38 = phi i8* [ %tmp36, %bb35 ], [ undef, %bb26 ] + %tmp39 = getelementptr inbounds i8, i8* %tmp38, i64 760 + %tmp40 = bitcast i8* %tmp39 to i64* + %tmp41 = load i64, i64* %tmp40, align 8 + %tmp42 = icmp slt i64 %tmp34, %tmp41 + %tmp43 = select i1 %tmp42, i64 %tmp41, i64 %tmp34 + %tmp44 = and i64 %tmp43, 63 + %tmp45 = ashr i64 %tmp29, %tmp44 + %sunkaddr2 = mul i64 %tmp14, 8 + %tmp6 = bitcast i64* %tmp18 to i8* + %sunkaddr3 = getelementptr i8, i8* %tmp6, i64 %sunkaddr2 + %tmp7 = bitcast i8* %sunkaddr3 to i64* + store i64 %tmp45, i64* %tmp7, align 8 + %tmp46 = sub i64 0, %tmp22 + store i64 %tmp46, i64* undef, align 8 + %tmp47 = add nsw i32 %tmp12, 1 + %tmp48 = add i32 %tmp10, 1 + %tmp49 = icmp sgt i32 %tmp48, 15140 + br i1 %tmp49, label %bb51.loopexit, label %bb7 + + bb51.loopexit: ; preds = %bb37 + %tmp9 = add i32 %tmp10, 1 + br label %bb51 + + bb51: ; preds = %bb51.loopexit, %bb2 + %tmp52 = phi i32 [ %tmp47, %bb51.loopexit ], [ 0, %bb2 ] + %tmp53 = phi double [ 2.000000e+00, %bb51.loopexit ], [ 0.000000e+00, %bb2 ] + %tmp54 = phi i32 [ %tmp9, %bb51.loopexit ], [ undef, %bb2 ] + %tmp56 = add i32 %tmp54, 0 + %tmp57 = call token (i64, i32, void (i32)*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull @wobble, i32 1, i32 0, i32 -121, i32 0, i32 38, i32 1, i32 0, i32 270, i32 4, i32 12, i32 0, i32 11, i64 undef, i32 99, i8* null, i32 10, i32 %tmp56, i32 6, float undef, i32 99, i8* null, i32 99, i8* null, i32 10, i32 %tmp52, i32 10, i32 undef, i32 99, i8* null, i32 7, double %tmp53, i32 99, i8* null, i32 7, double undef, i32 99, i8* null, i32 13, i8* undef, i32 99, i8* null, i32 99, i8* null, i8* undef) + unreachable + + bb59: ; preds = %bb2 + %tmp61 = call token (i64, i32, void (i32)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64 2882400000, i32 0, void (i32)* nonnull @wobble, i32 1, i32 0, i32 8, i32 0, i32 38, i32 1, i32 0, i32 123, i32 4, i32 12, i32 0, i32 13, i8* null, i32 99, i32 undef, i32 13, i8* null, i32 10, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i32 undef, i32 99, i8* null, i32 99, float undef, i32 99, double undef, i32 99, i8* null, i32 99, double undef, i32 99, i8* null, i32 13, i8* null, i32 99, double undef, i32 99, i8* null) + unreachable + } + + declare void @wibble() + + declare void @wobble(i32) + + declare token @llvm.experimental.gc.statepoint.p0f_isVoidi32f(i64, i32, void (i32)*, i32, i32, ...) + + declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) + + ; Function Attrs: nounwind + declare void @llvm.stackprotector(i8*, i8**) #0 + + attributes #0 = { nounwind } + + !0 = !{} +... +--- +name: eggs +alignment: 4 +tracksRegLiveness: true +fixedStack: + - { id: 0, type: spill-slot, offset: -56, size: 8, alignment: 8, callee-saved-register: '%rbx' } + - { id: 1, type: spill-slot, offset: -48, size: 8, alignment: 16, callee-saved-register: '%r12' } + - { id: 2, type: spill-slot, offset: -40, size: 8, alignment: 8, callee-saved-register: '%r13' } + - { id: 3, type: spill-slot, offset: -32, size: 8, alignment: 16, callee-saved-register: '%r14' } + - { id: 4, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%r15' } + - { id: 5, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '%rbp' } +stack: + - { id: 0, offset: -88, size: 8, alignment: 8 } + - { id: 1, offset: -96, size: 8, alignment: 8 } + - { id: 2, offset: -104, size: 8, alignment: 8 } + - { id: 3, offset: -64, size: 8, alignment: 8 } + - { id: 4, type: spill-slot, offset: -72, size: 8, alignment: 8 } + - { id: 5, type: spill-slot, offset: -80, size: 8, alignment: 8 } +constants: + - id: 0 + value: 'double 2.000000e+00' + alignment: 8 +body: | + bb.0.bb: + successors: %bb.1.bb2(0x00000800), %bb.3.bb3(0x7ffff800) + liveins: %rbp, %r15, %r14, %r13, %r12, %rbx + + frame-setup PUSH64r killed %rbp, implicit-def %rsp, implicit %rsp + frame-setup PUSH64r killed %r15, implicit-def %rsp, implicit %rsp + frame-setup PUSH64r killed %r14, implicit-def %rsp, implicit %rsp + frame-setup PUSH64r killed %r13, implicit-def %rsp, implicit %rsp + frame-setup PUSH64r killed %r12, implicit-def %rsp, implicit %rsp + frame-setup PUSH64r killed %rbx, implicit-def %rsp, implicit %rsp + %rsp = frame-setup SUB64ri8 %rsp, 56, implicit-def dead %eflags + CALL64r undef %rax, csr_64, implicit %rsp, implicit undef %rdi, implicit undef %rsi, implicit-def %rsp, implicit-def %rax + TEST64rr %rax, %rax, implicit-def %eflags + JNE_1 %bb.3.bb3, implicit killed %eflags + + bb.1.bb2: + successors: %bb.2(0x40000000), %bb.13.bb59(0x40000000) + + %ebp = XOR32rr undef %ebp, undef %ebp, implicit-def dead %eflags + TEST8rr %bpl, %bpl, 
implicit-def %eflags + JE_1 %bb.13.bb59, implicit killed %eflags + + bb.2: + successors: %bb.12.bb51(0x80000000) + liveins: %ebp + + %xmm0 = XORPSrr undef %xmm0, undef %xmm0 + %ebx = IMPLICIT_DEF implicit-def %rbx + JMP_1 %bb.12.bb51 + + bb.3.bb3: + successors: %bb.4.bb7(0x80000000) + liveins: %rax + + MOV64mr %rsp, 1, _, 32, _, %rax :: (store 8 into %stack.5) + %r12 = MOV64rr killed %rax + %r12 = ADD64ri8 killed %r12, 16, implicit-def dead %eflags + %xmm0 = XORPSrr undef %xmm0, undef %xmm0 + %esi = XOR32rr undef %esi, undef %esi, implicit-def dead %eflags + %rax = MOV64ri %const.0 + %xmm1 = MOVSDrm killed %rax, 1, _, 0, _ :: (load 8 from constant-pool) + MOVSDmr %rsp, 1, _, 40, _, killed %xmm1 :: (store 8 into %stack.4) + %eax = IMPLICIT_DEF + %ecx = XOR32rr undef %ecx, undef %ecx, implicit-def dead %eflags + + bb.4.bb7: + successors: %bb.6.bb26(0x40000000), %bb.5.bb15(0x40000000) + liveins: %eax, %ecx, %esi, %r12, %xmm0 + + %ebp = MOV32rr killed %ecx + %ebx = MOV32rr killed %eax, implicit-def %rbx + %r14d = MOV32rr %ebx, implicit-def %r14 + TEST8rr %sil, %sil, implicit-def %eflags + JNE_1 %bb.6.bb26, implicit %eflags + + bb.5.bb15: + successors: %bb.6.bb26(0x80000000) + liveins: %ebp, %rbx, %r14, %xmm0 + + MOV32mr %rsp, 1, _, 24, _, %ebx :: (store 4 into %stack.0, align 8) + MOV32mr %rsp, 1, _, 16, _, %ebp :: (store 4 into %stack.1, align 8) + MOVSDmr %rsp, 1, _, 8, _, killed %xmm0 :: (store 8 into %stack.2) + %rax = MOV64rm %rsp, 1, _, 32, _ :: (load 8 from %stack.5) + MOV64mr %rsp, 1, _, 48, _, killed %rax :: (store 8 into %stack.3) + %rax = MOV64ri @wibble + STATEPOINT 2882400000, 0, 0, killed %rax, 2, 0, 2, 0, 2, 30, 2, 1, 2, 0, 2, 99, 2, 0, 2, 12, 2, 0, 2, 10, 1, 8, %rsp, 24, 2, 10, 2, 0, 2, 10, 1, 8, %rsp, 16, 2, 10, 2, 4278124286, 2, 6, 2, 4278124286, 2, 7, 1, 8, %rsp, 8, 2, 99, 2, 0, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 13, 1, 8, %rsp, 48, 2, 7, 2, 4278124286, 2, 99, 2, 0, csr_64, implicit-def %rsp :: (volatile load 8 from %stack.0), (volatile load 8 from %stack.1), (volatile load 8 from %stack.2), (volatile load 8 from %stack.3) + %esi = XOR32rr undef %esi, undef %esi, implicit-def dead %eflags + %r12 = IMPLICIT_DEF + + bb.6.bb26: + successors: %bb.8.bb37(0x40000000), %bb.7.bb35(0x40000000) + liveins: %ebp, %esi, %rbx, %r12, %r14 + + %rax = MOV64ri @global.1 + %rax = MOV64rm killed %rax, 1, _, 0, _ :: (dereferenceable load 8 from @global.1) + TEST64rr %rax, %rax, implicit-def %eflags + %rax = CMOVE64rr undef %rax, killed %rax, implicit killed %eflags + %ecx = MOV32rm undef %rax, 1, _, 0, _ :: (load 4 from `i32* undef`) + %rdx = MOV64rm %r12, 8, %r14, 0, _ :: (load 8 from %ir.tmp3) + %r15 = LEA64r %rdx, 1, _, 1, _ + MOV64mr %r12, 8, %r14, 0, _, %r15 :: (store 8 into %ir.tmp3) + %ecx = SUB32rr killed %ecx, %edx, implicit-def dead %eflags, implicit killed %rdx + MOV32mr undef %rax, 1, _, 0, _, killed %ecx :: (store 4 into `i32* undef`) + %r13 = MOV64rm killed %rax, 1, _, 768, _ :: (load 8 from %ir.tmp33) + TEST8rr %sil, %sil, implicit-def %eflags + %rax = IMPLICIT_DEF + JNE_1 %bb.8.bb37, implicit %eflags + + bb.7.bb35: + successors: %bb.8.bb37(0x80000000) + liveins: %ebp, %rbx, %r12, %r13, %r14, %r15 + + %rsi = MOV64ri @global + %rax = MOV64ri @ham + CALL64r killed %rax, csr_64, implicit %rsp, implicit undef %rdi, implicit %rsi, implicit-def %rsp, implicit-def %rax + %esi = XOR32rr undef %esi, undef %esi, implicit-def dead %eflags + + bb.8.bb37: + successors: %bb.9.bb37(0x40000000), %bb.10.bb37(0x40000000) + liveins: %ebp, %esi, %rax, %rbx, %r12, %r13, %r14, %r15 + + %rcx = MOV64rm 
killed %rax, 1, _, 760, _ :: (load 8 from %ir.tmp40) + CMP64rr %r13, %rcx, implicit-def %eflags + JL_1 %bb.10.bb37, implicit %eflags + + bb.9.bb37: + successors: %bb.10.bb37(0x80000000) + liveins: %ebp, %esi, %rbx, %r12, %r13, %r14, %r15 + + %cl = MOV8rr %r13b, implicit killed %r13, implicit-def %rcx + + bb.10.bb37: + successors: %bb.11.bb51.loopexit(0x00000800), %bb.4.bb7(0x7ffff800) + liveins: %ebp, %esi, %rbx, %rcx, %r12, %r14, %r15 + + %cl = KILL %cl, implicit killed %rcx + %r15 = SAR64rCL killed %r15, implicit-def dead %eflags, implicit %cl + MOV64mr %r12, 8, killed %r14, 0, _, killed %r15 :: (store 8 into %ir.tmp7) + MOV64mi32 undef %rax, 1, _, 0, _, 0 :: (store 8 into `i64* undef`) + %eax = LEA64_32r %rbx, 1, _, 1, _ + %ecx = MOV32ri 6 + CMP32ri %eax, 15141, implicit-def %eflags + %xmm0 = MOVSDrm %rsp, 1, _, 40, _ :: (load 8 from %stack.4) + JL_1 %bb.4.bb7, implicit %eflags + + bb.11.bb51.loopexit: + successors: %bb.12.bb51(0x80000000) + liveins: %ebp, %rbx + + %ebp = INC32r killed %ebp, implicit-def dead %eflags + %ebx = INC32r %ebx, implicit-def dead %eflags, implicit killed %rbx, implicit-def %rbx + %rax = MOV64ri %const.0 + %xmm0 = MOVSDrm killed %rax, 1, _, 0, _ :: (load 8 from constant-pool) + + bb.12.bb51: + liveins: %ebp, %rbx, %xmm0 + + MOV32mr %rsp, 1, _, 24, _, %ebx, implicit killed %rbx :: (store 4 into %stack.0, align 8) + MOV32mr %rsp, 1, _, 16, _, killed %ebp :: (store 4 into %stack.1, align 8) + MOVSDmr %rsp, 1, _, 8, _, killed %xmm0 :: (store 8 into %stack.2) + %rax = MOV64ri @wobble + %edi = MOV32ri -121 + STATEPOINT 2882400000, 0, 1, killed %rax, %edi, 2, 0, 2, 0, 2, 38, 2, 1, 2, 0, 2, 270, 2, 4, 2, 12, 2, 0, 2, 11, 2, 4278124286, 2, 99, 2, 0, 2, 10, 1, 8, %rsp, 24, 2, 6, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 0, 2, 10, 1, 8, %rsp, 16, 2, 10, 2, 4278124286, 2, 99, 2, 0, 2, 7, 1, 8, %rsp, 8, 2, 99, 2, 0, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 13, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 0, csr_64, implicit-def %rsp :: (volatile load 8 from %stack.0), (volatile load 8 from %stack.1), (volatile load 8 from %stack.2) + + bb.13.bb59: + %rax = MOV64ri @wobble + %edi = MOV32ri 8 + STATEPOINT 2882400000, 0, 1, killed %rax, %edi, 2, 0, 2, 0, 2, 38, 2, 1, 2, 0, 2, 123, 2, 4, 2, 12, 2, 0, 2, 13, 2, 0, 2, 99, 2, 4278124286, 2, 13, 2, 0, 2, 10, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 0, 2, 13, 2, 0, 2, 99, 2, 4278124286, 2, 99, 2, 0, csr_64, implicit-def %rsp + +... 
Index: test/CodeGen/X86/vector-shuffle-v48.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v48.ll +++ test/CodeGen/X86/vector-shuffle-v48.ll @@ -3,42 +3,18 @@ define <32 x i8> @foo(<48 x i8>* %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: foo: ; CHECK: # BB#0: -; CHECK-NEXT: vmovdqu 32(%rdi), %xmm0 -; CHECK-NEXT: vmovdqu (%rdi), %ymm1 -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vpextrb $0, %xmm2, %eax -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero -; CHECK-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpextrb $2, %xmm2, %eax -; CHECK-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpextrb $3, %xmm2, %eax -; CHECK-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpextrb $5, %xmm2, %eax -; CHECK-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpextrb $6, %xmm2, %eax -; CHECK-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpextrb $1, %xmm0, %eax -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,11,12,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $2, %xmm0, %eax -; CHECK-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $4, %xmm0, %eax -; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $5, %xmm0, %eax -; CHECK-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $7, %xmm0, %eax -; CHECK-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $8, %xmm0, %eax -; CHECK-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $10, %xmm0, %eax -; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $11, %xmm0, %eax -; CHECK-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $13, %xmm0, %eax -; CHECK-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrb $14, %xmm0, %eax -; CHECK-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 +; CHECK-NEXT: vmovdqu 32(%rdi), %xmm1 +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,2,3,5,6] +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero +; CHECK-NEXT: vpor %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,11,12,14,15,u,u,u,u,u,u,u,u,u,u] +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0] +; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = load <48 x i8>, <48 x i8>* %x0, align 1 %2 = shufflevector <48 x i8> %1, <48 x i8> undef, <32 x i32> Index: test/CodeGen/X86/xray-attribute-instrumentation.ll =================================================================== --- test/CodeGen/X86/xray-attribute-instrumentation.ll +++ test/CodeGen/X86/xray-attribute-instrumentation.ll @@ -14,17 +14,16 @@ ; CHECK-NEXT: nopw %cs:512(%rax,%rax) } ; CHECK: .p2align 4, 0x90 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_0 ; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_0 ; CHECK-NEXT: .section {{.*}}xray_instr_map -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .quad {{.*}}xray_sled_0 ; CHECK: .quad {{.*}}xray_sled_1 -; 
CHECK-LABEL: Lxray_synthetic_end0: +; CHECK-LABEL: Lxray_sleds_end0: ; CHECK: .section {{.*}}xray_fn_idx ; CHECK-LABEL: Lxray_fn_idx_synth_0: -; CHECK: .quad {{.*}}xray_synthetic_0 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_end0 +; CHECK: .quad {{.*}}xray_sleds_start0 +; CHECK-NEXT: .quad {{.*}}xray_sleds_end0 ; We test multiple returns in a single function to make sure we're getting all @@ -52,15 +51,14 @@ ; CHECK-NEXT: nopw %cs:512(%rax,%rax) } ; CHECK: .p2align 4, 0x90 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_1 ; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_1 ; CHECK-NEXT: .section {{.*}}xray_instr_map -; CHECK-LABEL: Lxray_synthetic_1: +; CHECK-LABEL: Lxray_sleds_start1: ; CHECK: .quad {{.*}}xray_sled_2 ; CHECK: .quad {{.*}}xray_sled_3 ; CHECK: .quad {{.*}}xray_sled_4 -; CHECK-LABEL: Lxray_synthetic_end1: +; CHECK-LABEL: Lxray_sleds_end1: ; CHECK: .section {{.*}}xray_fn_idx ; CHECK-LABEL: Lxray_fn_idx_synth_1: -; CHECK: .quad {{.*}}xray_synthetic_1 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_end1 +; CHECK: .quad {{.*}}xray_sleds_start1 +; CHECK-NEXT: .quad {{.*}}xray_sleds_end1 Index: test/CodeGen/X86/xray-custom-log.ll =================================================================== --- test/CodeGen/X86/xray-custom-log.ll +++ test/CodeGen/X86/xray-custom-log.ll @@ -17,7 +17,7 @@ ret i32 0 } ; CHECK: .section {{.*}}xray_instr_map -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .quad {{.*}}xray_event_sled_0 declare void @llvm.xray.customevent(i8*, i32) Index: test/CodeGen/X86/xray-log-args.ll =================================================================== --- test/CodeGen/X86/xray-log-args.ll +++ test/CodeGen/X86/xray-log-args.ll @@ -6,7 +6,7 @@ define i32 @callee(i32 %arg) nounwind noinline uwtable "function-instrument"="xray-always" "xray-log-args"="1" { ret i32 %arg } -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .quad {{\.?}}Lxray_sled_0 ; CHECK: .quad {{_?}}callee ; CHECK: .byte 3 @@ -22,7 +22,7 @@ %retval = tail call i32 @callee(i32 %arg) ret i32 %retval } -; CHECK-LABEL: Lxray_synthetic_1: +; CHECK-LABEL: Lxray_sleds_start1: ; CHECK: .quad {{\.?}}Lxray_sled_2 ; CHECK: .quad {{_?}}caller ; CHECK: .byte 3 Index: test/CodeGen/X86/xray-tail-call-sled.ll =================================================================== --- test/CodeGen/X86/xray-tail-call-sled.ll +++ test/CodeGen/X86/xray-tail-call-sled.ll @@ -14,17 +14,16 @@ ; CHECK-NEXT: nopw %cs:512(%rax,%rax) } ; CHECK: .p2align 4, 0x90 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_0{{.*}} ; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_0{{.*}} ; CHECK-NEXT: .section {{.*}}xray_instr_map -; CHECK-LABEL: Lxray_synthetic_0: +; CHECK-LABEL: Lxray_sleds_start0: ; CHECK: .quad {{.*}}xray_sled_0 ; CHECK: .quad {{.*}}xray_sled_1 -; CHECK-LABEL: Lxray_synthetic_end0: +; CHECK-LABEL: Lxray_sleds_end0: ; CHECK-NEXT: .section {{.*}}xray_fn_idx ; CHECK-LABEL: Lxray_fn_idx_synth_0: -; CHECK: .quad {{.*}}xray_synthetic_0 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_end0 +; CHECK: .quad {{.*}}xray_sleds_start0 +; CHECK-NEXT: .quad {{.*}}xray_sleds_end0 define i32 @caller() nounwind noinline uwtable "function-instrument"="xray-always" { ; CHECK: .p2align 1, 0x90 @@ -42,13 +41,12 @@ ret i32 %retval } ; CHECK: .p2align 4, 0x90 -; CHECK-NEXT: .quad {{.*}}xray_synthetic_1{{.*}} ; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_1{{.*}} -; CHECK-LABEL: Lxray_synthetic_1: +; CHECK-LABEL: Lxray_sleds_start1: ; CHECK: .quad {{.*}}xray_sled_2 ; CHECK: .quad {{.*}}xray_sled_3 -; CHECK-LABEL: 
Lxray_synthetic_end1: +; CHECK-LABEL: Lxray_sleds_end1: ; CHECK: .section {{.*}}xray_fn_idx ; CHECK-LABEL: Lxray_fn_idx_synth_1: -; CHECK: .quad {{.*}}xray_synthetic_1 -; CHECK: .quad {{.*}}xray_synthetic_end1 +; CHECK: .quad {{.*}}xray_sleds_start1 +; CHECK: .quad {{.*}}xray_sleds_end1 Index: test/DebugInfo/COFF/globals.ll =================================================================== --- test/DebugInfo/COFF/globals.ll +++ test/DebugInfo/COFF/globals.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s | FileCheck %s --check-prefix=ASM ; RUN: llc < %s -filetype=obj | llvm-readobj - -codeview | FileCheck %s --check-prefix=OBJ +; RUN: llc < %s -filetype=obj | obj2yaml | FileCheck %s --check-prefix=YAML ; C++ source to regenerate: ; $ cat t.cpp @@ -109,6 +110,43 @@ ; OBJ: ] ; OBJ: ] +; YAML-LABEL: - Name: '.debug$S' +; YAML: Subsections: +; YAML: - !Symbols +; YAML: Records: +; YAML: - Kind: S_COMPILE3 +; YAML: Compile3Sym: +; YAML: - !Symbols +; YAML: Records: +; YAML: - Kind: S_LDATA32 +; YAML: DataSym: +; YAML-NOT: Segment +; YAML: Type: 116 +; YAML-NOT: Segment +; YAML: DisplayName: first +; YAML-NOT: Segment +; YAML: - Kind: S_GTHREAD32 +; YAML: ThreadLocalDataSym: +; YAML: Type: 4097 +; YAML: DisplayName: middle +; YAML: - Kind: S_GDATA32 +; YAML: DataSym: +; YAML-NOT: Segment +; YAML: Type: 116 +; YAML-NOT: Offset +; YAML-NOT: Segment +; YAML: DisplayName: last +; YAML-NOT: Segment + +; The missing offsets are represented as relocations against this section. +; YAML: Relocations: +; YAML: - VirtualAddress: 92 +; YAML: SymbolName: '?first@@3HA' +; YAML: Type: IMAGE_REL_AMD64_SECREL +; YAML: - VirtualAddress: 96 +; YAML: SymbolName: '?first@@3HA' +; YAML: Type: IMAGE_REL_AMD64_SECTION + ; ModuleID = 't.cpp' source_filename = "t.cpp" target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" Index: test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir =================================================================== --- /dev/null +++ test/DebugInfo/MIR/AArch64/implicit-def-dead-scope.mir @@ -0,0 +1,249 @@ +# RUN: llc -start-after=livedebugvalues -filetype=obj -o - %s \ +# RUN: | llvm-dwarfdump - | FileCheck %s + +# This tests for a crash in DwarfDebug's singular DBG_VALUE range promotion when +# encountering an IMPLICIT_DEF in its own lexical scope. 
+ +# CHECK: .debug_info contents: +# CHECK: DW_TAG_formal_parameter +# CHECK: DW_AT_const_value [DW_FORM_udata] (0) +--- | + ; ModuleID = 't.ll' + source_filename = "t.ll" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64--linux-gnu" + + %class.v = type <{ i32, i8, i8, [2 x i8] }> + %class.j = type <{ %"class.j<6, a::f>::D", i32, [4 x i8] }> + %"class.j<6, a::f>::D" = type { %"class.j<6, a::f>::p" } + %"class.j<6, a::f>::p" = type { i64 } + + @bt = global i32 0, align 4 + + define void @_ZN1v2bvEv(%class.v* nocapture readonly %this) local_unnamed_addr align 2 !dbg !14 { + entry: + %bz = alloca %class.j, align 8 + %att = alloca %class.j, align 8 + %ap = getelementptr inbounds %class.v, %class.v* %this, i64 0, i32 1 + %0 = load i8, i8* %ap, align 4 + %conv = sext i8 %0 to i32 + switch i32 %conv, label %sw.epilog [ + i32 1, label %_ZN1jILi6EN1a1fEE1mEj.exit + i32 0, label %sw.bb2 + ] + + _ZN1jILi6EN1a1fEE1mEj.exit: ; preds = %entry + %1 = bitcast %class.j* %att to i64* + %2 = bitcast %class.j* %bz to i64* + store i64 1, i64* %2, align 8 + call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !18, metadata !23), !dbg !24 + store i64 1, i64* %1, align 8, !dbg !27 + br label %sw.epilog + + sw.bb2: ; preds = %entry + %3 = bitcast %class.j* %att to i64* + %4 = bitcast %class.j* %bz to i64* + %.pre = load i64, i64* %3, align 8 + %agg.tmp.sroa.2.0..sroa_idx1.i.i.i.i.i.i13.phi.trans.insert = getelementptr inbounds %class.j, %class.j* %bz, i64 0, i32 1 + %.phi.trans.insert = bitcast i32* %agg.tmp.sroa.2.0..sroa_idx1.i.i.i.i.i.i13.phi.trans.insert to i64* + %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i14.pre = load i64, i64* %.phi.trans.insert, align 8 + %.pre25 = load i64, i64* %4, align 8 + %agg.tmp.sroa.2.0..sroa_idx1.i.i.i.i.i.i.phi.trans.insert = getelementptr inbounds %class.j, %class.j* %att, i64 0, i32 1 + %.phi.trans.insert26 = bitcast i32* %agg.tmp.sroa.2.0..sroa_idx1.i.i.i.i.i.i.phi.trans.insert to i64* + %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i.pre = load i64, i64* %.phi.trans.insert26, align 8 + br label %sw.epilog + + sw.epilog: ; preds = %sw.bb2, %_ZN1jILi6EN1a1fEE1mEj.exit, %entry + %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i = phi i64 [ %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i.pre, %sw.bb2 ], [ undef, %entry ], [ undef, %_ZN1jILi6EN1a1fEE1mEj.exit ], !dbg !32 + %5 = phi i64 [ %.pre25, %sw.bb2 ], [ 0, %entry ], [ 1, %_ZN1jILi6EN1a1fEE1mEj.exit ] + %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i14 = phi i64 [ %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i14.pre, %sw.bb2 ], [ undef, %entry ], [ undef, %_ZN1jILi6EN1a1fEE1mEj.exit ] + %6 = phi i64 [ %.pre, %sw.bb2 ], [ 0, %entry ], [ 1, %_ZN1jILi6EN1a1fEE1mEj.exit ] + %bw1 = bitcast %class.v* %this to i32* + %7 = load i32, i32* %bw1, align 4 + %bx = getelementptr inbounds %class.v, %class.v* %this, i64 0, i32 2 + %8 = load i8, i8* %bx, align 1 + %tobool = icmp ne i8 %8, 0 + %.fca.0.insert9 = insertvalue [2 x i64] undef, i64 %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i14, 0 + %.fca.1.insert12 = insertvalue [2 x i64] %.fca.0.insert9, i64 %5, 1 + %.fca.0.insert = insertvalue [2 x i64] undef, i64 %agg.tmp.sroa.2.0.copyload2.i.i6.i.i.i.i, 0 + %.fca.1.insert = insertvalue [2 x i64] %.fca.0.insert, i64 %6, 1 + call void @_Z2byi1LS_bbPi(i32 %7, [2 x i64] %.fca.1.insert12, [2 x i64] %.fca.1.insert, i1 %tobool, i1 false, i32* nonnull @bt) + ret void + } + + declare void @_Z2byi1LS_bbPi(i32, [2 x i64], [2 x i64], i1, i1, i32*) local_unnamed_addr + + ; Function Attrs: nounwind readnone speculatable + declare void 
@llvm.dbg.value(metadata, i64, metadata, metadata) #0 + + ; Function Attrs: nounwind + declare void @llvm.stackprotector(i8*, i8**) #1 + + attributes #0 = { nounwind readnone speculatable } + attributes #1 = { nounwind } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!12, !13} + + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 305696) (llvm/trunk 305708)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !2) + !1 = !DIFile(filename: "/", directory: "/") + !2 = !{} + !3 = !{!4, !10} + !4 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "r", file: !5, line: 50, size: 8, elements: !6, identifier: "_ZTS1r") + !5 = !DIFile(filename: "current.ii", directory: "/") + !6 = !{!7} + !7 = !DISubprogram(name: "r", scope: !4, file: !5, line: 52, type: !8, isLocal: false, isDefinition: false, scopeLine: 52, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true) + !8 = !DISubroutineType(types: !9) + !9 = !{null} + !10 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "q", scope: !11, file: !5, line: 39, size: 64, elements: !2, identifier: "_ZTSN1jILi6EN1a1fEE1qE") + !11 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "j<6, a::f>", file: !5, line: 7, size: 128, elements: !2, templateParams: !2, identifier: "_ZTS1jILi6EN1a1fEE") + !12 = !{i32 2, !"Debug Info Version", i32 3} + !13 = !{i32 1, !"wchar_size", i32 4} + !14 = distinct !DISubprogram(name: "bv", linkageName: "_ZN1v2bvEv", scope: !15, file: !5, line: 104, type: !16, isLocal: false, isDefinition: true, scopeLine: 104, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !17, variables: !2) + !15 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "v", file: !5, line: 97, size: 64, elements: !2, identifier: "_ZTS1v") + !16 = !DISubroutineType(types: !2) + !17 = !DISubprogram(name: "bv", linkageName: "_ZN1v2bvEv", scope: !15, file: !5, line: 98, type: !16, isLocal: false, isDefinition: false, scopeLine: 98, flags: DIFlagPrototyped, isOptimized: true) + !18 = !DILocalVariable(arg: 2, scope: !19, file: !5, line: 22, type: !21) + !19 = distinct !DISubprogram(name: "m", linkageName: "_ZN1jILi6EN1a1fEE1mEj", scope: !11, file: !5, line: 22, type: !16, isLocal: false, isDefinition: true, scopeLine: 22, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !20, variables: !2) + !20 = !DISubprogram(name: "m", linkageName: "_ZN1jILi6EN1a1fEE1mEj", scope: !11, file: !5, line: 22, type: !16, isLocal: false, isDefinition: false, scopeLine: 22, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true) + !21 = !DIDerivedType(tag: DW_TAG_typedef, name: "h", file: !5, line: 10, baseType: !22) + !22 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) + !23 = !DIExpression() + !24 = !DILocation(line: 22, scope: !19, inlinedAt: !25) + !25 = distinct !DILocation(line: 109, scope: !26) + !26 = distinct !DILexicalBlock(scope: !14, file: !5, line: 106) + !27 = !DILocation(line: 29, scope: !28, inlinedAt: !31) + !28 = distinct !DISubprogram(name: "n", linkageName: "_ZN1jILi6EN1a1fEE1p1nEl", scope: !29, file: !5, line: 29, type: !8, isLocal: false, isDefinition: true, scopeLine: 29, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !30, variables: !2) + !29 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "p", scope: !11, file: !5, line: 26, size: 64, elements: !2, identifier: "_ZTSN1jILi6EN1a1fEE1pE") + !30 = !DISubprogram(name: "n", linkageName: 
"_ZN1jILi6EN1a1fEE1p1nEl", scope: !29, file: !5, line: 29, type: !8, isLocal: false, isDefinition: false, scopeLine: 29, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true) + !31 = distinct !DILocation(line: 24, scope: !19, inlinedAt: !25) + !32 = !DILocation(line: 61, scope: !33, inlinedAt: !38) + !33 = distinct !DISubprogram(name: "bc >", linkageName: "_ZN1s2bcI1jILi6EN1a1fEEEEDTcl2badeclsr1aE2aaIPT_EEEES6_", scope: !34, file: !5, line: 60, type: !16, isLocal: false, isDefinition: true, scopeLine: 60, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !36, declaration: !35, variables: !2) + !34 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s", file: !5, line: 59, size: 8, elements: !2, identifier: "_ZTS1s") + !35 = !DISubprogram(name: "bc >", linkageName: "_ZN1s2bcI1jILi6EN1a1fEEEEDTcl2badeclsr1aE2aaIPT_EEEES6_", scope: !34, file: !5, line: 60, type: !16, isLocal: false, isDefinition: false, scopeLine: 60, flags: DIFlagPrototyped, isOptimized: true, templateParams: !36) + !36 = !{!37} + !37 = !DITemplateTypeParameter(name: "ay", type: !11) + !38 = distinct !DILocation(line: 70, scope: !39, inlinedAt: !42) + !39 = distinct !DISubprogram(name: "bc", linkageName: "_ZN1JI1s1jILi6EN1a1fEEE2bcEPS4_", scope: !40, file: !5, line: 70, type: !16, isLocal: false, isDefinition: true, scopeLine: 70, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !41, variables: !2) + !40 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "J >", file: !5, line: 69, size: 8, elements: !2, templateParams: !2, identifier: "_ZTS1JI1s1jILi6EN1a1fEEE") + !41 = !DISubprogram(name: "bc", linkageName: "_ZN1JI1s1jILi6EN1a1fEEE2bcEPS4_", scope: !40, file: !5, line: 70, type: !16, isLocal: false, isDefinition: false, scopeLine: 70, flags: DIFlagPrototyped, isOptimized: true) + !42 = distinct !DILocation(line: 85, scope: !43, inlinedAt: !46) + !43 = distinct !DISubprogram(name: "u >", linkageName: "_ZN1uC2I1jILi6EN1a1fEEEERT_", scope: !44, file: !5, line: 85, type: !16, isLocal: false, isDefinition: true, scopeLine: 85, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !36, declaration: !45, variables: !2) + !44 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "u", file: !5, line: 82, size: 128, elements: !2, identifier: "_ZTS1u") + !45 = !DISubprogram(name: "u >", scope: !44, file: !5, line: 85, type: !16, isLocal: false, isDefinition: false, scopeLine: 85, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true, templateParams: !36) + !46 = distinct !DILocation(line: 85, scope: !47, inlinedAt: !48) + !47 = distinct !DISubprogram(name: "u >", linkageName: "_ZN1uC1I1jILi6EN1a1fEEEERT_", scope: !44, file: !5, line: 85, type: !16, isLocal: false, isDefinition: true, scopeLine: 85, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !36, declaration: !45, variables: !2) + !48 = distinct !DILocation(line: 92, scope: !49, inlinedAt: !52) + !49 = distinct !DISubprogram(name: "L >", linkageName: "_ZN1LC2I1jILi6EN1a1fEEEERT_", scope: !50, file: !5, line: 92, type: !16, isLocal: false, isDefinition: true, scopeLine: 92, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !2, declaration: !51, variables: !2) + !50 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "L", file: !5, line: 88, size: 128, elements: !2, identifier: "_ZTS1L") + !51 = !DISubprogram(name: "L >", scope: !50, file: !5, line: 92, type: !16, isLocal: false, isDefinition: false, scopeLine: 92, flags: DIFlagPublic | 
DIFlagPrototyped, isOptimized: true, templateParams: !2) + !52 = distinct !DILocation(line: 92, scope: !53, inlinedAt: !54) + !53 = distinct !DISubprogram(name: "L >", linkageName: "_ZN1LC1I1jILi6EN1a1fEEEERT_", scope: !50, file: !5, line: 92, type: !16, isLocal: false, isDefinition: true, scopeLine: 92, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !2, declaration: !51, variables: !2) + !54 = distinct !DILocation(line: 114, scope: !14) + +... +--- +name: _ZN1v2bvEv +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: +liveins: + - { reg: '%x0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 48 + offsetAdjustment: 0 + maxAlignment: 16 + adjustsStack: true + hasCalls: true + stackProtector: '' + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: + - { id: 0, name: bz, type: default, offset: -32, size: 16, alignment: 8, + callee-saved-register: '', local-offset: -16, di-variable: '', di-expression: '', + di-location: '' } + - { id: 1, name: att, type: default, offset: -48, size: 16, alignment: 8, + callee-saved-register: '', local-offset: -32, di-variable: '', di-expression: '', + di-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -16, size: 8, alignment: 16, + callee-saved-register: '%lr', di-variable: '', di-expression: '', + di-location: '' } +constants: +body: | + bb.0.entry: + successors: %bb.3.sw.bb2(0x2aaaaaab), %bb.1.entry(0x55555555) + liveins: %x0, %lr + + %sp = frame-setup SUBXri %sp, 48, 0 + frame-setup STRXui killed %lr, %sp, 4 :: (store 8 into %stack.2) + frame-setup CFI_INSTRUCTION def_cfa_offset 48 + frame-setup CFI_INSTRUCTION offset %w30, -16 + %w8 = LDRSBWui %x0, 4 :: (load 1 from %ir.ap, align 4) + CBZW %w8, %bb.3.sw.bb2 + + bb.1.entry: + successors: %bb.2._ZN1jILi6EN1a1fEE1mEj.exit(0x40000001), %bb.4(0x3fffffff) + liveins: %w8, %x0 + + dead %wzr = SUBSWri killed %w8, 1, 0, implicit-def %nzcv + Bcc 1, %bb.4, implicit %nzcv + + bb.2._ZN1jILi6EN1a1fEE1mEj.exit: + successors: %bb.5.sw.epilog(0x80000000) + liveins: %x0 + + %w2 = ORRWri %wzr, 0, implicit-def %x2 + %x3 = IMPLICIT_DEF debug-location !32 + %x1 = IMPLICIT_DEF + STRXui %x2, %sp, 2 :: (store 8 into %ir.2) + DBG_VALUE 0, 0, !18, !23, debug-location !24 + STRXui %x2, %sp, 0, debug-location !27 :: (store 8 into %ir.1) + %w4 = ORRWri %wzr, 0, implicit-def %x4 + B %bb.5.sw.epilog + + bb.3.sw.bb2: + successors: %bb.5.sw.epilog(0x80000000) + liveins: %x0 + + %x4, %x3 = LDPXi %sp, 0 :: (dereferenceable load 8 from %ir.3), (dereferenceable load 8 from %ir..phi.trans.insert26) + %x2, %x1 = LDPXi %sp, 2 :: (dereferenceable load 8 from %ir..phi.trans.insert), (dereferenceable load 8 from %ir.4) + B %bb.5.sw.epilog + + bb.4: + successors: %bb.5.sw.epilog(0x80000000) + liveins: %x0 + + %x2 = ORRXrs %xzr, %xzr, 0 + %x4 = ORRXrs %xzr, %xzr, 0 + %x3 = IMPLICIT_DEF debug-location !32 + %x1 = IMPLICIT_DEF + + bb.5.sw.epilog: + liveins: %x0, %x1, %x2, %x3, %x4 + + %w8 = LDRBBui %x0, 5 :: (load 1 from %ir.bx) + %w0 = LDRWui killed %x0, 0 :: (load 4 from %ir.bw1) + %x7 = ADRP target-flags(aarch64-page) @bt + %x7 = ADDXri killed %x7, target-flags(aarch64-pageoff, aarch64-nc) @bt, 0 + dead %wzr = SUBSWri killed %w8, 0, 0, implicit-def %nzcv + %w5 = CSINCWr %wzr, %wzr, 0, implicit killed %nzcv + %w6 = ORRWrs %wzr, 
%wzr, 0 + BL @_Z2byi1LS_bbPi, csr_aarch64_aapcs, implicit-def dead %lr, implicit %sp, implicit killed %w0, implicit killed %x1, implicit killed %x2, implicit killed %x3, implicit killed %x4, implicit killed %w5, implicit killed %w6, implicit killed %x7, implicit-def %sp + %lr = LDRXui %sp, 4 :: (load 8 from %stack.2) + %sp = ADDXri %sp, 48, 0 + RET undef %lr + +... Index: test/DebugInfo/PDB/pdb-yaml-symbols.test =================================================================== --- test/DebugInfo/PDB/pdb-yaml-symbols.test +++ test/DebugInfo/PDB/pdb-yaml-symbols.test @@ -55,6 +55,7 @@ YAML: DbgStart: 3 YAML: DbgEnd: 8 YAML: FunctionType: 4097 +YAML: Offset: 16 YAML: Segment: 1 YAML: Flags: [ HasFP ] YAML: DisplayName: main @@ -178,4 +179,4 @@ YAML: Length: 8 YAML: Characteristics: 1107296320 YAML: Name: .reloc -YAML: ... \ No newline at end of file +YAML: ... Index: test/MC/AMDGPU/flat-global.s =================================================================== --- /dev/null +++ test/MC/AMDGPU/flat-global.s @@ -0,0 +1,87 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding 2>&1 %s | FileCheck -check-prefix=GFX9-ERR -check-prefix=GCNERR %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding 2>&1 %s | FileCheck -check-prefix=VI-ERR -check-prefix=GCNERR %s + +global_load_ubyte v1, v[3:4] +// GFX9: global_load_ubyte v1, v[3:4] ; encoding: [0x00,0x80,0x40,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_sbyte v1, v[3:4] +// GFX9: global_load_sbyte v1, v[3:4] ; encoding: [0x00,0x80,0x44,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_ushort v1, v[3:4] +// GFX9: global_load_ushort v1, v[3:4] ; encoding: [0x00,0x80,0x48,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_sshort v1, v[3:4] +// GFX9: global_load_sshort v1, v[3:4] ; encoding: [0x00,0x80,0x4c,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_dword v1, v[3:4] +// GFX9: global_load_dword v1, v[3:4] ; encoding: [0x00,0x80,0x50,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_dwordx2 v[1:2], v[3:4] +// GFX9: global_load_dwordx2 v[1:2], v[3:4] ; encoding: [0x00,0x80,0x54,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_dwordx3 v[1:3], v[3:4] +// GFX9: global_load_dwordx3 v[1:3], v[3:4] ; encoding: [0x00,0x80,0x58,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU + +global_load_dwordx4 v[1:4], v[3:4] +// GFX9: global_load_dwordx4 v[1:4], v[3:4] ; encoding: [0x00,0x80,0x5c,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: instruction not supported on this GPU +// FIXME: VI error should be instruction not supported +global_load_dword v1, v[3:4] offset:0 +// GFX9: global_load_dword v1, v[3:4] ; encoding: [0x00,0x80,0x50,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: :36: error: not a valid operand. + +global_load_dword v1, v[3:4] offset:4095 +// GFX9: global_load_dword v1, v[3:4] offset:4095 ; encoding: [0xff,0x8f,0x50,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: :36: error: not a valid operand. + +global_load_dword v1, v[3:4] offset:-1 +// GFX9: global_load_dword v1, v[3:4] offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: :36: error: not a valid operand.
+ +global_load_dword v1, v[3:4] offset:-4096 +// GFX9: global_load_dword v1, v[3:4] offset:-4096 ; encoding: [0x00,0x90,0x50,0xdc,0x03,0x00,0x00,0x01] +// VI-ERR: :36: error: not a valid operand. + +global_load_dword v1, v[3:4] offset:4096 +// GFX9-ERR: :30: error: invalid operand for instruction +// VI-ERR: :36: error: not a valid operand. + +global_load_dword v1, v[3:4] offset:-4097 +// GFX9-ERR: :30: error: invalid operand for instruction +// VI-ERR: :36: error: not a valid operand. + +global_store_byte v[3:4], v1 +// GFX9: global_store_byte v[3:4], v1 ; encoding: [0x00,0x80,0x60,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_short v[3:4], v1 +// GFX9: global_store_short v[3:4], v1 ; encoding: [0x00,0x80,0x68,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_dword v[3:4], v1 +// GFX9: global_store_dword v[3:4], v1 ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_dwordx2 v[3:4], v[1:2] +// GFX9: global_store_dwordx2 v[3:4], v[1:2] ; encoding: [0x00,0x80,0x74,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_dwordx3 v[3:4], v[1:3] +// GFX9: global_store_dwordx3 v[3:4], v[1:3] ; encoding: [0x00,0x80,0x78,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_dwordx4 v[3:4], v[1:4] +// GFX9: global_store_dwordx4 v[3:4], v[1:4] ; encoding: [0x00,0x80,0x7c,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: instruction not supported on this GPU + +global_store_dword v[3:4], v1 offset:12 +// GFX9: global_store_dword v[3:4], v1 offset:12 ; encoding: [0x0c,0x80,0x70,0xdc,0x03,0x01,0x00,0x00] +// VI-ERR: :37: error: not a valid operand Index: test/MC/AMDGPU/mtbuf.s =================================================================== --- /dev/null +++ test/MC/AMDGPU/mtbuf.s @@ -0,0 +1,36 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=tahiti -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICI %s +// RUN: llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICI %s +// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +//===----------------------------------------------------------------------===// +// Test for dfmt and nfmt (tbuffer only) +//===----------------------------------------------------------------------===// + +tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7b,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: 
tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7c,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7d,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 +// SICI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x01,0x01] +// VI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x01,0x01] + +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] + Index: test/MC/ARM/elf-movt.s =================================================================== --- test/MC/ARM/elf-movt.s +++ test/MC/ARM/elf-movt.s @@ -1,6 +1,6 @@ @ RUN: llvm-mc %s -triple=armv7-linux-gnueabi | FileCheck -check-prefix=ASM %s -@ RUN: llvm-mc %s -triple=armv7-linux-gnueabi -filetype=obj -o - | \ -@ RUN: llvm-readobj -s -sd -sr | FileCheck -check-prefix=OBJ %s +@ RUN: llvm-mc %s -triple=armv7-linux-gnueabi -filetype=obj -o %t.o +@ RUN: llvm-objdump -d -r %t.o -triple=armv7-linux-gnueabi | FileCheck -check-prefix=OBJ %s .syntax unified .text .globl barf @@ -14,41 +14,9 @@ @ ASM: movw r0, :lower16:(GOT-(.LPC0_2+8)) @ ASM-NEXT: movt r0, :upper16:(GOT-(.LPC0_2+8)) -@@ make sure that the text section fixups are sane too -@ OBJ: Section { -@ OBJ: Name: .text -@ OBJ-NEXT: Type: SHT_PROGBITS -@ OBJ-NEXT: Flags [ (0x6) -@ OBJ-NEXT: SHF_ALLOC -@ OBJ-NEXT: SHF_EXECINSTR -@ OBJ-NEXT: ] -@ OBJ-NEXT: Address: 0x0 -@ OBJ-NEXT: Offset: 0x34 -@ OBJ-NEXT: Size: 8 -@ OBJ-NEXT: Link: 0 -@ OBJ-NEXT: Info: 0 -@ OBJ-NEXT: AddressAlignment: 4 -@ OBJ-NEXT: EntrySize: 0 -@ OBJ-NEXT: Relocations [ -@ OBJ-NEXT: ] -@ OBJ-NEXT: SectionData ( -@ OBJ-NEXT: 0000: F00F0FE3 F40F4FE3 -@ OBJ-NEXT: ) -@ OBJ-NEXT: } -@ OBJ: Section { -@ OBJ: Index: -@ OBJ: Name: .rel.text -@ OBJ-NEXT: Type: SHT_REL (0x9) -@ OBJ-NEXT: Flags [ (0x0) -@ OBJ-NEXT: ] -@ OBJ-NEXT: Address: 0x0 -@ OBJ-NEXT: Offset: -@ OBJ-NEXT: Size: 16 -@ OBJ-NEXT: Link: -@ OBJ-NEXT: Info: -@ OBJ-NEXT: AddressAlignment: 4 -@ OBJ-NEXT: EntrySize: 8 -@ OBJ-NEXT: Relocations [ -@ OBJ-NEXT: 0x0 R_ARM_MOVW_PREL_NC GOT 0x0 -@ OBJ-NEXT: 0x4 R_ARM_MOVT_PREL GOT 0x0 -@ OBJ-NEXT: ] +@OBJ: Disassembly of section .text: +@OBJ-NEXT: barf: +@OBJ-NEXT: 0: f0 0f 0f e3 movw r0, #65520 +@OBJ-NEXT: 00000000: R_ARM_MOVW_PREL_NC GOT +@OBJ-NEXT: 4: f4 0f 4f e3 movt r0, #65524 +@OBJ-NEXT: 00000004: R_ARM_MOVT_PREL GOT Index: test/MC/Disassembler/AMDGPU/mtbuf_vi.txt =================================================================== --- /dev/null +++ test/MC/Disassembler/AMDGPU/mtbuf_vi.txt @@ -0,0 +1,22 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=tonga -disassemble -show-encoding < %s | FileCheck %s -check-prefix=VI + +# VI: tbuffer_load_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x01,0x01] +0x00 0x00 0x78 0xe9 0x00 
0x01 0x01 0x01 + +# VI: tbuffer_load_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x01,0x01,0x01] +0x00 0x80 0x78 0xe9 0x00 0x01 0x01 0x01 + +# VI: tbuffer_load_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x01,0x01,0x01] +0x00 0x80 0x79 0xe9 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_x v1, off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x01,0x01] +0x00 0x00 0x7a 0xe9 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_xy v[1:2], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x01,0x01,0x01] +0x00 0x80 0x7a 0xe9 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_xyzw v[1:4], off, s[4:7], dfmt:15, nfmt:2, s1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x01,0x01] +0x00 0x80 0x7b 0xe9 0x00 0x01 0x01 0x01 + +# VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] +0x00 0x80 0x7b 0xe9 0x00 0x01 0x1d 0x71 Index: test/MC/ELF/bad-expr2.s =================================================================== --- test/MC/ELF/bad-expr2.s +++ test/MC/ELF/bad-expr2.s @@ -1,11 +1,10 @@ // RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o /dev/null \ // RUN: 2>&1 | FileCheck %s -// CHECK: No relocation available to represent this relative expression -// CHECK: call foo - bar - - +// CHECK: [[@LINE+2]]:{{[0-9]+}}: error: No relocation available to represent this relative expression +// CHECK-NEXT: call foo - bar call foo - bar + .section .foo foo: .section .bar Index: test/Transforms/CodeGenPrepare/X86/memcmp.ll =================================================================== --- test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ test/Transforms/CodeGenPrepare/X86/memcmp.ll @@ -6,9 +6,47 @@ declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp2( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp2( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i16* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i16* +; X32-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32 +; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; X32-NEXT: [[TMP8:%.*]] = sub i32 [[TMP6]], [[TMP7]] +; X32-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0 +; X32-NEXT: br i1 [[TMP9]], label %res_block, label %endblock +; X32: res_block: +; X32-NEXT: [[TMP10:%.*]] = icmp ult i32 [[TMP6]], [[TMP7]] +; X32-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1 +; X32-NEXT: br label %endblock +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp2( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i16* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i16* +; X64-NEXT: [[TMP2:%.*]] = load i16, i16* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i64 +; 
X64-NEXT: [[TMP8:%.*]] = sub i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0 +; X64-NEXT: br i1 [[TMP9]], label %res_block, label %endblock +; X64: res_block: +; X64-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1 +; X64-NEXT: br label %endblock +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) ret i32 %call @@ -24,9 +62,45 @@ } define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp4( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp4( +; X32-NEXT: loadbb: +; X32-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i32* +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i32* +; X32-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X32-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[TMP5]] +; X32-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0 +; X32-NEXT: br i1 [[TMP7]], label %res_block, label %endblock +; X32: res_block: +; X32-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] +; X32-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1 +; X32-NEXT: br label %endblock +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP9]], %res_block ] +; X32-NEXT: ret i32 [[PHI_RES]] +; +; X64-LABEL: @cmp4( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i32* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i32* +; X64-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 +; X64-NEXT: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 +; X64-NEXT: [[TMP8:%.*]] = sub i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0 +; X64-NEXT: br i1 [[TMP9]], label %res_block, label %endblock +; X64: res_block: +; X64-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP6]], [[TMP7]] +; X64-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 -1, i32 1 +; X64-NEXT: br label %endblock +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP11]], %res_block ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) ret i32 %call @@ -60,9 +134,28 @@ } define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp8( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) -; ALL-NEXT: ret i32 [[CALL]] +; X32-LABEL: @cmp8( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) +; X32-NEXT: ret i32 [[CALL]] +; +; X64-LABEL: @cmp8( +; X64-NEXT: loadbb: +; X64-NEXT: [[TMP0:%.*]] = bitcast i8* %x to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %y to i64* +; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]] +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]]) +; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = sub i64 [[TMP4]], [[TMP5]] +; X64-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; X64-NEXT: br i1 [[TMP7]], label %res_block, label %endblock 
+; X64: res_block: +; X64-NEXT: [[TMP8:%.*]] = icmp ult i64 [[TMP4]], [[TMP5]] +; X64-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 -1, i32 1 +; X64-NEXT: br label %endblock +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, %loadbb ], [ [[TMP9]], %res_block ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) ret i32 %call @@ -142,8 +235,13 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq2( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i16* +; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] +; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -168,8 +266,13 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq4( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i32* +; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] +; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] +; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] ; @@ -219,11 +322,22 @@ } define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq8( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X32-LABEL: @cmp_eq8( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; +; X64-LABEL: @cmp_eq8( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i64* +; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] +; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] +; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]] +; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) %cmp = icmp eq i32 %call, 0 Index: test/Transforms/IndVarSimplify/huge_muls.ll =================================================================== --- /dev/null +++ test/Transforms/IndVarSimplify/huge_muls.ll @@ -0,0 +1,87 @@ +; RUN: opt < %s -indvars -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; This test takes excessively long time if SCEV tries to construct huge +; SCEVMulExpr's (with ~1000 ops) due to non-linear analysis cost. 
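+; (For illustration only: each multiply in the body below feeds the next one,
+; so a fully expanded SCEV for the value returned via %tmp17 can accumulate on
+; the order of a thousand multiply operands, which is the cost noted above.)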
+define i32 @test() { +; CHECK-LABEL: @test( +bci_0: + br label %bci_12 + +bci_133: ; preds = %bci_127.unr-lcssa + ret i32 %tmp17 + +bci_12: ; preds = %bci_127.unr-lcssa, %bci_0 + %indvars.iv184 = phi i64 [ %indvars.iv.next185, %bci_127.unr-lcssa ], [ 3, %bci_0 ] + %tmp1 = trunc i64 %indvars.iv184 to i32 + br label %bci_55.postloop + +bci_127.unr-lcssa: ; preds = %bci_90.postloop + %indvars.iv.next185 = add nuw nsw i64 %indvars.iv184, 1 + %tmp4 = icmp sgt i64 %indvars.iv184, 91 + br i1 %tmp4, label %bci_133, label %bci_12 + +bci_55.postloop: ; preds = %bci_90.postloop, %bci_12 + %indvars.iv180.postloop = phi i64 [ %indvars.iv.next181.postloop, %bci_90.postloop ], [ 15, %bci_12 ] + %local_2_16.postloop = phi i32 [ %tmp17, %bci_90.postloop ], [ 4, %bci_12 ] + %indvars.iv.next181.postloop = add nuw nsw i64 %indvars.iv180.postloop, 1 + %tmp6 = load i32, i32 addrspace(1)* undef, align 4 + %tmp7 = mul i32 %tmp6, %tmp1 + br label %not_zero65.us.postloop + +not_zero65.us.postloop: ; preds = %not_zero65.us.postloop.1, %bci_55.postloop + %local_2_24.us.postloop = phi i32 [ %local_2_16.postloop, %bci_55.postloop ], [ %tmp49, %not_zero65.us.postloop.1 ] + %local_6_.us.postloop = phi i32 [ 3, %bci_55.postloop ], [ %tmp50, %not_zero65.us.postloop.1 ] + %tmp8 = mul i32 %tmp7, %local_2_24.us.postloop + %tmp9 = mul i32 %tmp8, %local_2_24.us.postloop + %tmp10 = mul i32 %tmp7, %tmp9 + %tmp11 = mul i32 %tmp10, %tmp9 + %tmp12 = mul i32 %tmp7, %tmp11 + %tmp13 = mul i32 %tmp12, %tmp11 + %tmp14 = mul i32 %tmp7, %tmp13 + %tmp15 = mul i32 %tmp14, %tmp13 + %tmp16 = mul i32 %tmp7, %tmp15 + %tmp17 = mul i32 %tmp16, %tmp15 + %tmp18 = icmp sgt i32 %local_6_.us.postloop, 82 + br i1 %tmp18, label %bci_90.postloop, label %not_zero65.us.postloop.1 + +bci_90.postloop: ; preds = %not_zero65.us.postloop + %tmp19 = icmp sgt i64 %indvars.iv180.postloop, 68 + br i1 %tmp19, label %bci_127.unr-lcssa, label %bci_55.postloop + +not_zero65.us.postloop.1: ; preds = %not_zero65.us.postloop + %tmp20 = mul i32 %tmp7, %tmp17 + %tmp21 = mul i32 %tmp20, %tmp17 + %tmp22 = mul i32 %tmp7, %tmp21 + %tmp23 = mul i32 %tmp22, %tmp21 + %tmp24 = mul i32 %tmp7, %tmp23 + %tmp25 = mul i32 %tmp24, %tmp23 + %tmp26 = mul i32 %tmp7, %tmp25 + %tmp27 = mul i32 %tmp26, %tmp25 + %tmp28 = mul i32 %tmp7, %tmp27 + %tmp29 = mul i32 %tmp28, %tmp27 + %tmp30 = mul i32 %tmp7, %tmp29 + %tmp31 = mul i32 %tmp30, %tmp29 + %tmp32 = mul i32 %tmp7, %tmp31 + %tmp33 = mul i32 %tmp32, %tmp31 + %tmp34 = mul i32 %tmp7, %tmp33 + %tmp35 = mul i32 %tmp34, %tmp33 + %tmp36 = mul i32 %tmp7, %tmp35 + %tmp37 = mul i32 %tmp36, %tmp35 + %tmp38 = mul i32 %tmp7, %tmp37 + %tmp39 = mul i32 %tmp38, %tmp37 + %tmp40 = mul i32 %tmp7, %tmp39 + %tmp41 = mul i32 %tmp40, %tmp39 + %tmp42 = mul i32 %tmp7, %tmp41 + %tmp43 = mul i32 %tmp42, %tmp41 + %tmp44 = mul i32 %tmp7, %tmp43 + %tmp45 = mul i32 %tmp44, %tmp43 + %tmp46 = mul i32 %tmp7, %tmp45 + %tmp47 = mul i32 %tmp46, %tmp45 + %tmp48 = mul i32 %tmp7, %tmp47 + %tmp49 = mul i32 %tmp48, %tmp47 + %tmp50 = add nsw i32 %local_6_.us.postloop, 20 + br label %not_zero65.us.postloop +} Index: test/Transforms/LoopVectorize/AMDGPU/packed-math.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/AMDGPU/packed-math.ll @@ -0,0 +1,34 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI 
-check-prefix=GCN %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s + +; GCN-LABEL: @vectorize_v2f16_loop( +; GFX9: vector.body: +; GFX9: phi <2 x half> +; GFX9: load <2 x half> +; GFX9: fadd fast <2 x half> + +; GFX9: middle.block: +; GFX9: fadd fast <2 x half> + +; VI: phi half +; VI: phi load half +; VI: fadd fast half +define half @vectorize_v2f16_loop(half addrspace(1)* noalias %s) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %q.04 = phi half [ 0.0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds half, half addrspace(1)* %s, i64 %indvars.iv + %0 = load half, half addrspace(1)* %arrayidx, align 2 + %add = fadd fast half %q.04, %0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + %add.lcssa = phi half [ %add, %for.body ] + ret half %add.lcssa +} Index: test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -0,0 +1,195 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s + +; FIXME: Should still like to vectorize the memory operations for VI + +; Simple 3-pair chain with loads and stores +; GCN-LABEL: @test1_as_3_3_3_v2f16( +; GFX9: load <2 x half>, <2 x half> addrspace(3)* +; GFX9: load <2 x half>, <2 x half> addrspace(3)* +; GFX9: fmul <2 x half> +; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* % +; GFX9: ret + +; VI: load half +; VI: load half +define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) { + %i0 = load half, half addrspace(3)* %a, align 2 + %i1 = load half, half addrspace(3)* %b, align 2 + %mul = fmul half %i0, %i1 + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1 + %i4 = load half, half addrspace(3)* %arrayidx4, align 2 + %mul5 = fmul half %i3, %i4 + store half %mul, half addrspace(3)* %c, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + store half %mul5, half addrspace(3)* %arrayidx5, align 2 + ret void +} + +; GCN-LABEL: @test1_as_3_0_0( +; GFX9: load <2 x half>, <2 x half> addrspace(3)* +; GFX9: load <2 x half>, <2 x half>* +; GFX9: fmul <2 x half> +; GFX9: store <2 x half> %{{.*}}, <2 x half>* % +; GFX9: ret + +; VI: load half +; VI: load half +define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) { + %i0 = load half, half addrspace(3)* %a, align 2 + %i1 = load half, half* %b, align 2 + %mul = fmul half %i0, %i1 + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half* %b, i64 1 + %i4 = load half, half* %arrayidx4, align 2 + %mul5 = fmul half %i3, %i4 + store half %mul, half* %c, align 2 + %arrayidx5 = getelementptr inbounds half, half* %c, i64 1 + store half %mul5, half* %arrayidx5, align 2 + ret void +} + +; GCN-LABEL: @test1_as_0_0_3_v2f16( 
+; GFX9: load <2 x half>, <2 x half>* +; GFX9: load <2 x half>, <2 x half>* +; GFX9: fmul <2 x half> +; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* % +; GFX9: ret + +; VI: load half +; VI: load half +define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) { + %i0 = load half, half* %a, align 2 + %i1 = load half, half* %b, align 2 + %mul = fmul half %i0, %i1 + %arrayidx3 = getelementptr inbounds half, half* %a, i64 1 + %i3 = load half, half* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half* %b, i64 1 + %i4 = load half, half* %arrayidx4, align 2 + %mul5 = fmul half %i3, %i4 + store half %mul, half addrspace(3)* %c, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + store half %mul5, half addrspace(3)* %arrayidx5, align 2 + ret void +} + +; GCN-LABEL: @test1_fma_v2f16( +; GFX9: load <2 x half> +; GFX9: load <2 x half> +; GFX9: load <2 x half> +; GFX9: call <2 x half> @llvm.fma.v2f16( +; GFX9: store <2 x half> +define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) { + %i0 = load half, half addrspace(3)* %a, align 2 + %i1 = load half, half addrspace(3)* %b, align 2 + %i2 = load half, half addrspace(3)* %c, align 2 + %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2) + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1 + %i4 = load half, half addrspace(3)* %arrayidx4, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + %i5 = load half, half addrspace(3)* %arrayidx5, align 2 + %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5) + store half %fma0, half addrspace(3)* %d, align 2 + %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1 + store half %fma1, half addrspace(3)* %arrayidx6, align 2 + ret void +} + +; GCN-LABEL: @mul_scalar_v2f16( +; GFX9: load <2 x half> +; GFX9: fmul <2 x half> +; GFX9: store <2 x half> +define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) { + %i0 = load half, half addrspace(3)* %a, align 2 + %mul = fmul half %i0, %scalar + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %mul5 = fmul half %i3, %scalar + store half %mul, half addrspace(3)* %c, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + store half %mul5, half addrspace(3)* %arrayidx5, align 2 + ret void +} + +; GCN-LABEL: @fabs_v2f16 +; GFX9: load <2 x half> +; GFX9: call <2 x half> @llvm.fabs.v2f16( +; GFX9: store <2 x half> +define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) { + %i0 = load half, half addrspace(3)* %a, align 2 + %fabs0 = call half @llvm.fabs.f16(half %i0) + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %fabs1 = call half @llvm.fabs.f16(half %i3) + store half %fabs0, half addrspace(3)* %c, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + store half %fabs1, half addrspace(3)* %arrayidx5, align 2 + ret void +} + +; GCN-LABEL: @test1_fabs_fma_v2f16( +; GFX9: load <2 x half> +; GFX9: call <2 x half> @llvm.fabs.v2f16( +; GFX9: call <2 x half> @llvm.fma.v2f16( +; GFX9: store <2 x half> +define 
amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) { + %i0 = load half, half addrspace(3)* %a, align 2 + %i1 = load half, half addrspace(3)* %b, align 2 + %i2 = load half, half addrspace(3)* %c, align 2 + %i0.fabs = call half @llvm.fabs.f16(half %i0) + + %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2) + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1 + %i4 = load half, half addrspace(3)* %arrayidx4, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + %i5 = load half, half addrspace(3)* %arrayidx5, align 2 + %i3.fabs = call half @llvm.fabs.f16(half %i3) + + %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5) + store half %fma0, half addrspace(3)* %d, align 2 + %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1 + store half %fma1, half addrspace(3)* %arrayidx6, align 2 + ret void +} + +; FIXME: Should do vector load and extract component for fabs +; GCN-LABEL: @test1_fabs_scalar_fma_v2f16( +; GFX9: load half +; GFX9: call half @llvm.fabs.f16( +; GFX9: load <2 x half> +; GFX9: load half +; GFX9: load <2 x half> +; GFX9: call <2 x half> @llvm.fma.v2f16( +; GFX9: store <2 x half> +define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) { + %i0 = load half, half addrspace(3)* %a, align 2 + %i1 = load half, half addrspace(3)* %b, align 2 + %i2 = load half, half addrspace(3)* %c, align 2 + %i1.fabs = call half @llvm.fabs.f16(half %i1) + + %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2) + %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1 + %i3 = load half, half addrspace(3)* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1 + %i4 = load half, half addrspace(3)* %arrayidx4, align 2 + %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1 + %i5 = load half, half addrspace(3)* %arrayidx5, align 2 + %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5) + store half %fma0, half addrspace(3)* %d, align 2 + %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1 + store half %fma1, half addrspace(3)* %arrayidx6, align 2 + ret void +} + +declare half @llvm.fabs.f16(half) #1 +declare half @llvm.fma.f16(half, half, half) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll =================================================================== --- test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -slp-vectorizer -dce < %s | FileCheck %s -; XFAIL: * -; -; FIXME: If this test expects to be vectorized, the TTI must indicate that the target -; has vector registers of the expected width. -; Currently, it says there are 8 vector registers that are 32-bits wide. 
- -target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64" - - -; Simple 3-pair chain with loads and stores -define amdgpu_kernel void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) { -; CHECK-LABEL: @test1_as_3_3_3( -; CHECK: load <2 x double>, <2 x double> addrspace(3)* -; CHECK: load <2 x double>, <2 x double> addrspace(3)* -; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* % -; CHECK: ret - %i0 = load double, double addrspace(3)* %a, align 8 - %i1 = load double, double addrspace(3)* %b, align 8 - %mul = fmul double %i0, %i1 - %arrayidx3 = getelementptr inbounds double, double addrspace(3)* %a, i64 1 - %i3 = load double, double addrspace(3)* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double addrspace(3)* %b, i64 1 - %i4 = load double, double addrspace(3)* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - store double %mul, double addrspace(3)* %c, align 8 - %arrayidx5 = getelementptr inbounds double, double addrspace(3)* %c, i64 1 - store double %mul5, double addrspace(3)* %arrayidx5, align 8 - ret void -} - -define amdgpu_kernel void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) { -; CHECK-LABEL: @test1_as_3_0_0( -; CHECK: load <2 x double>, <2 x double> addrspace(3)* -; CHECK: load <2 x double>, <2 x double>* -; CHECK: store <2 x double> %{{.*}}, <2 x double>* % -; CHECK: ret - %i0 = load double, double addrspace(3)* %a, align 8 - %i1 = load double, double* %b, align 8 - %mul = fmul double %i0, %i1 - %arrayidx3 = getelementptr inbounds double, double addrspace(3)* %a, i64 1 - %i3 = load double, double addrspace(3)* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 - %i4 = load double, double* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - store double %mul, double* %c, align 8 - %arrayidx5 = getelementptr inbounds double, double* %c, i64 1 - store double %mul5, double* %arrayidx5, align 8 - ret void -} - -define amdgpu_kernel void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) { -; CHECK-LABEL: @test1_as_0_0_3( -; CHECK: load <2 x double>, <2 x double>* -; CHECK: load <2 x double>, <2 x double>* -; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* % -; CHECK: ret - %i0 = load double, double* %a, align 8 - %i1 = load double, double* %b, align 8 - %mul = fmul double %i0, %i1 - %arrayidx3 = getelementptr inbounds double, double* %a, i64 1 - %i3 = load double, double* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds double, double* %b, i64 1 - %i4 = load double, double* %arrayidx4, align 8 - %mul5 = fmul double %i3, %i4 - store double %mul, double addrspace(3)* %c, align 8 - %arrayidx5 = getelementptr inbounds double, double addrspace(3)* %c, i64 1 - store double %mul5, double addrspace(3)* %arrayidx5, align 8 - ret void -} Index: test/tools/llvm-objdump/X86/macho-info-plist.test =================================================================== --- test/tools/llvm-objdump/X86/macho-info-plist.test +++ test/tools/llvm-objdump/X86/macho-info-plist.test @@ -1,7 +1,11 @@ # RUN: llvm-mc < %s -triple x86_64-apple-darwin -filetype=obj | llvm-objdump -m -info-plist - | FileCheck %s +# RUN: llvm-mc < %s -triple x86_64-apple-darwin -filetype=obj | llvm-objdump -m -info-plist -no-leading-headers - | FileCheck 
--check-prefix=NOHEADER %s .section __TEXT, __info_plist .asciz "\n" # CHECK: Contents of (__TEXT,__info_plist) section # CHECK: + +# NOHEADER-NOT: Contents of (__TEXT,__info_plist) section +# NOHEADER: Index: test/tools/llvm-objdump/X86/macho-objc-meta-data.test =================================================================== --- test/tools/llvm-objdump/X86/macho-objc-meta-data.test +++ test/tools/llvm-objdump/X86/macho-objc-meta-data.test @@ -1042,7 +1042,7 @@ OBJC2_64BIT_DYLIB: Contents of (__DATA_CONST,__objc_classlist) section OBJC2_64BIT_DYLIB: 000000000000c038 0x8030 _OBJC_CLASS_$_Test OBJC2_64BIT_DYLIB: isa 0x8008 _OBJC_METACLASS_$_Test -OBJC2_64BIT_DYLIB: superclass 0x0 +OBJC2_64BIT_DYLIB: superclass 0x0 _OBJC_CLASS_$_NSObject OBJC2_64BIT_DYLIB: cache 0x0 OBJC2_64BIT_DYLIB: vtable 0x0 OBJC2_64BIT_DYLIB: data 0xc120 (struct class_ro_t *) @@ -1081,7 +1081,7 @@ OBJC2_64BIT_DYLIB: attributes 0x4f4b TQ,V_testProp OBJC2_64BIT_DYLIB: Meta Class OBJC2_64BIT_DYLIB: isa 0x0 -OBJC2_64BIT_DYLIB: superclass 0x0 +OBJC2_64BIT_DYLIB: superclass 0x0 _OBJC_METACLASS_$_NSObject OBJC2_64BIT_DYLIB: cache 0x0 OBJC2_64BIT_DYLIB: vtable 0x0 OBJC2_64BIT_DYLIB: data 0xc048 (struct class_ro_t *) Index: tools/llvm-cvtres/llvm-cvtres.cpp =================================================================== --- tools/llvm-cvtres/llvm-cvtres.cpp +++ tools/llvm-cvtres/llvm-cvtres.cpp @@ -37,7 +37,7 @@ enum ID { OPT_INVALID = 0, // This is not an option ID. #define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR) \ + HELPTEXT, METAVAR, VALUES) \ OPT_##ID, #include "Opts.inc" #undef OPTION @@ -49,12 +49,12 @@ static const opt::OptTable::Info InfoTable[] = { #define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR) \ + HELPTEXT, METAVAR, VALUES) \ { \ - PREFIX, NAME, HELPTEXT, \ - METAVAR, OPT_##ID, opt::Option::KIND##Class, \ - PARAM, FLAGS, OPT_##GROUP, \ - OPT_##ALIAS, ALIASARGS}, + PREFIX, NAME, HELPTEXT, \ + METAVAR, OPT_##ID, opt::Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, \ + OPT_##ALIAS, ALIASARGS, VALUES}, #include "Opts.inc" #undef OPTION }; Index: tools/llvm-objdump/MachODump.cpp =================================================================== --- tools/llvm-objdump/MachODump.cpp +++ tools/llvm-objdump/MachODump.cpp @@ -1135,7 +1135,8 @@ DataRefImpl Ref = Section.getRawDataRefImpl(); StringRef SegName = O->getSectionFinalSegmentName(Ref); if (SegName == "__TEXT" && SectName == "__info_plist") { - outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; + if (!NoLeadingHeaders) + outs() << "Contents of (" << SegName << "," << SectName << ") section\n"; StringRef BytesStr; Section.getContents(BytesStr); const char *sect = reinterpret_cast(BytesStr.data()); @@ -4572,6 +4573,12 @@ n_value, c.superclass); if (name != nullptr) outs() << " " << name; + else { + name = get_dyld_bind_info_symbolname(S.getAddress() + + offset + offsetof(struct class64_t, superclass), info); + if (name != nullptr) + outs() << " " << name; + } outs() << "\n"; outs() << " cache " << format("0x%" PRIx64, c.cache); Index: tools/llvm-pdbutil/Diff.cpp =================================================================== --- tools/llvm-pdbutil/Diff.cpp +++ tools/llvm-pdbutil/Diff.cpp @@ -198,17 +198,6 @@ File2.getBlockCount()); Diffs |= diffAndPrint("Unknown 1", File1, File2, File1.getUnknown1(), File2.getUnknown1()); - - if (opts::diff::Pedantic) { - Diffs |= diffAndPrint("Free Block Map", File1, File2, - 
File1.getFreeBlockMapBlock(), - File2.getFreeBlockMapBlock()); - Diffs |= diffAndPrint("Directory Size", File1, File2, - File1.getNumDirectoryBytes(), - File2.getNumDirectoryBytes()); - Diffs |= diffAndPrint("Block Map Addr", File1, File2, - File1.getBlockMapOffset(), File2.getBlockMapOffset()); - } if (!Diffs) outs() << "MSF Super Block: No differences detected...\n"; return Error::success(); @@ -222,114 +211,72 @@ outs() << "Stream Directory: Searching for differences...\n"; bool HasDifferences = false; - if (opts::diff::Pedantic) { - size_t Min = std::min(P.size(), Q.size()); - for (size_t I = 0; I < Min; ++I) { - StringRef Names[] = {P[I], Q[I]}; - uint32_t Sizes[] = {File1.getStreamByteSize(I), - File2.getStreamByteSize(I)}; - bool NamesDiffer = Names[0] != Names[1]; - bool SizesDiffer = Sizes[0] != Sizes[1]; - if (NamesDiffer) { - HasDifferences = true; - outs().indent(2) << formatv("Stream {0} - {1}: {2}, {3}: {4}\n", I, - File1.getFilePath(), Names[0], - File2.getFilePath(), Names[1]); - continue; - } - if (SizesDiffer) { - HasDifferences = true; - outs().indent(2) << formatv( - "Stream {0} ({1}): {2}: {3} bytes, {4}: {5} bytes\n", I, Names[0], - File1.getFilePath(), Sizes[0], File2.getFilePath(), Sizes[1]); - continue; - } - } + auto PI = to_vector<32>(enumerate(P)); + auto QI = to_vector<32>(enumerate(Q)); - ArrayRef MaxNames = (P.size() > Q.size() ? P : Q); - size_t Max = std::max(P.size(), Q.size()); - PDBFile &MaxFile = (P.size() > Q.size() ? File1 : File2); - StringRef MinFileName = - (P.size() < Q.size() ? File1.getFilePath() : File2.getFilePath()); - for (size_t I = Min; I < Max; ++I) { - HasDifferences = true; - StringRef StreamName = MaxNames[I]; - - outs().indent(2) << formatv( - "Stream {0} - {1}: , {2}: Index {3}, {4} bytes\n", - StreamName, MinFileName, MaxFile.getFilePath(), I, - MaxFile.getStreamByteSize(I)); - } - if (!HasDifferences) - outs() << "Stream Directory: No differences detected...\n"; - } else { - auto PI = to_vector<32>(enumerate(P)); - auto QI = to_vector<32>(enumerate(Q)); - - typedef decltype(PI) ContainerType; - typedef typename ContainerType::value_type value_type; - - auto Comparator = [](const value_type &I1, const value_type &I2) { - return I1.value() < I2.value(); - }; - - decltype(PI) OnlyP; - decltype(QI) OnlyQ; - decltype(PI) Common; - - set_differences(PI, QI, &OnlyP, &OnlyQ, &Common, Comparator); - - if (!OnlyP.empty()) { - HasDifferences = true; - outs().indent(2) << formatv("{0} Stream(s) only in ({1})\n", OnlyP.size(), - File1.getFilePath()); - for (auto &Item : OnlyP) { - outs().indent(4) << formatv("Stream {0} - {1}\n", Item.index(), - Item.value()); - } + typedef decltype(PI) ContainerType; + typedef typename ContainerType::value_type value_type; + + auto Comparator = [](const value_type &I1, const value_type &I2) { + return I1.value() < I2.value(); + }; + + decltype(PI) OnlyP; + decltype(QI) OnlyQ; + decltype(PI) Common; + + set_differences(PI, QI, &OnlyP, &OnlyQ, &Common, Comparator); + + if (!OnlyP.empty()) { + HasDifferences = true; + outs().indent(2) << formatv("{0} Stream(s) only in ({1})\n", OnlyP.size(), + File1.getFilePath()); + for (auto &Item : OnlyP) { + outs().indent(4) << formatv("Stream {0} - {1}\n", Item.index(), + Item.value()); } + } - if (!OnlyQ.empty()) { - HasDifferences = true; - outs().indent(2) << formatv("{0} Streams(s) only in ({1})\n", - OnlyQ.size(), File2.getFilePath()); - for (auto &Item : OnlyQ) { - outs().indent(4) << formatv("Stream {0} - {1}\n", Item.index(), - Item.value()); - } + if 
(!OnlyQ.empty()) { + HasDifferences = true; + outs().indent(2) << formatv("{0} Streams(s) only in ({1})\n", OnlyQ.size(), + File2.getFilePath()); + for (auto &Item : OnlyQ) { + outs().indent(4) << formatv("Stream {0} - {1}\n", Item.index(), + Item.value()); } - if (!Common.empty()) { - outs().indent(2) << formatv("Found {0} common streams. Searching for " - "intra-stream differences.\n", - Common.size()); - bool HasCommonDifferences = false; - for (const auto &Left : Common) { - // Left was copied from the first range so its index refers to a stream - // index in the first file. Find the corresponding stream index in the - // second file. - auto Range = - std::equal_range(QI.begin(), QI.end(), Left, - [](const value_type &L, const value_type &R) { - return L.value() < R.value(); - }); - const auto &Right = *Range.first; - assert(Left.value() == Right.value()); - uint32_t LeftSize = File1.getStreamByteSize(Left.index()); - uint32_t RightSize = File2.getStreamByteSize(Right.index()); - if (LeftSize != RightSize) { - HasDifferences = true; - HasCommonDifferences = true; - outs().indent(4) << formatv("{0} ({1}: {2} bytes, {3}: {4} bytes)\n", - Left.value(), File1.getFilePath(), - LeftSize, File2.getFilePath(), RightSize); - } + } + if (!Common.empty()) { + outs().indent(2) << formatv("Found {0} common streams. Searching for " + "intra-stream differences.\n", + Common.size()); + bool HasCommonDifferences = false; + for (const auto &Left : Common) { + // Left was copied from the first range so its index refers to a stream + // index in the first file. Find the corresponding stream index in the + // second file. + auto Range = + std::equal_range(QI.begin(), QI.end(), Left, + [](const value_type &L, const value_type &R) { + return L.value() < R.value(); + }); + const auto &Right = *Range.first; + assert(Left.value() == Right.value()); + uint32_t LeftSize = File1.getStreamByteSize(Left.index()); + uint32_t RightSize = File2.getStreamByteSize(Right.index()); + if (LeftSize != RightSize) { + HasDifferences = true; + HasCommonDifferences = true; + outs().indent(4) << formatv("{0} ({1}: {2} bytes, {3}: {4} bytes)\n", + Left.value(), File1.getFilePath(), LeftSize, + File2.getFilePath(), RightSize); } - if (!HasCommonDifferences) - outs().indent(2) << "Common Streams: No differences detected!\n"; } - if (!HasDifferences) - outs() << "Stream Directory: No differences detected!\n"; + if (!HasCommonDifferences) + outs().indent(2) << "Common Streams: No differences detected!\n"; } + if (!HasDifferences) + outs() << "Stream Directory: No differences detected!\n"; return Error::success(); } @@ -384,77 +331,39 @@ auto IdList1 = ST1.name_ids(); auto IdList2 = ST2.name_ids(); - if (opts::diff::Pedantic) { - // In pedantic mode, we compare index by index (i.e. the strings are in the - // same order - // in both tables. - uint32_t Max = std::max(IdList1.size(), IdList2.size()); - for (uint32_t I = 0; I < Max; ++I) { - Optional Id1, Id2; - StringRef S1, S2; - if (I < IdList1.size()) { - Id1 = IdList1[I]; - if (auto Result = ST1.getStringForID(*Id1)) - S1 = *Result; - else - return Result.takeError(); - } - if (I < IdList2.size()) { - Id2 = IdList2[I]; - if (auto Result = ST2.getStringForID(*Id2)) - S2 = *Result; - else - return Result.takeError(); - } - if (Id1 == Id2 && S1 == S2) - continue; - - std::string OutId1 = - Id1 ? formatv("{0}", *Id1).str() : "(index not present)"; - std::string OutId2 = - Id2 ? 
formatv("{0}", *Id2).str() : "(index not present)"; - outs() << formatv(" String {0}\n", I); - outs() << formatv(" {0}: Hash - {1}, Value - {2}\n", - File1.getFilePath(), OutId1, S1); - outs() << formatv(" {0}: Hash - {1}, Value - {2}\n", - File2.getFilePath(), OutId2, S2); - HasDiff = true; - } - } else { - std::vector Strings1, Strings2; - Strings1.reserve(IdList1.size()); - Strings2.reserve(IdList2.size()); - for (auto ID : IdList1) { - auto S = ST1.getStringForID(ID); - if (!S) - return S.takeError(); - Strings1.push_back(*S); - } - for (auto ID : IdList2) { - auto S = ST2.getStringForID(ID); - if (!S) - return S.takeError(); - Strings2.push_back(*S); - } + std::vector Strings1, Strings2; + Strings1.reserve(IdList1.size()); + Strings2.reserve(IdList2.size()); + for (auto ID : IdList1) { + auto S = ST1.getStringForID(ID); + if (!S) + return S.takeError(); + Strings1.push_back(*S); + } + for (auto ID : IdList2) { + auto S = ST2.getStringForID(ID); + if (!S) + return S.takeError(); + Strings2.push_back(*S); + } - SmallVector OnlyP; - SmallVector OnlyQ; - auto End1 = std::remove(Strings1.begin(), Strings1.end(), ""); - auto End2 = std::remove(Strings2.begin(), Strings2.end(), ""); - uint32_t Empty1 = std::distance(End1, Strings1.end()); - uint32_t Empty2 = std::distance(End2, Strings2.end()); - Strings1.erase(End1, Strings1.end()); - Strings2.erase(End2, Strings2.end()); - set_differences(Strings1, Strings2, &OnlyP, &OnlyQ); - printSymmetricDifferences(File1, File2, OnlyP, OnlyQ, "String"); - - if (Empty1 != Empty2) { - PDBFile &MoreF = (Empty1 > Empty2) ? File1 : File2; - PDBFile &LessF = (Empty1 < Empty2) ? File1 : File2; - uint32_t Difference = AbsoluteDifference(Empty1, Empty2); - outs() << formatv(" {0} had {1} more empty strings than {2}\n", - MoreF.getFilePath(), Difference, LessF.getFilePath()); - } + SmallVector OnlyP; + SmallVector OnlyQ; + auto End1 = std::remove(Strings1.begin(), Strings1.end(), ""); + auto End2 = std::remove(Strings2.begin(), Strings2.end(), ""); + uint32_t Empty1 = std::distance(End1, Strings1.end()); + uint32_t Empty2 = std::distance(End2, Strings2.end()); + Strings1.erase(End1, Strings1.end()); + Strings2.erase(End2, Strings2.end()); + set_differences(Strings1, Strings2, &OnlyP, &OnlyQ); + printSymmetricDifferences(File1, File2, OnlyP, OnlyQ, "String"); + + if (Empty1 != Empty2) { + PDBFile &MoreF = (Empty1 > Empty2) ? File1 : File2; + PDBFile &LessF = (Empty1 < Empty2) ? 
File1 : File2; + uint32_t Difference = AbsoluteDifference(Empty1, Empty2); + outs() << formatv(" {0} had {1} more empty strings than {2}\n", + MoreF.getFilePath(), Difference, LessF.getFilePath()); } if (!HasDiff) outs() << "String Table: No differences detected!\n"; Index: tools/llvm-pdbutil/llvm-pdbutil.h =================================================================== --- tools/llvm-pdbutil/llvm-pdbutil.h +++ tools/llvm-pdbutil/llvm-pdbutil.h @@ -127,10 +127,6 @@ extern llvm::cl::opt RawAll; } -namespace diff { -extern llvm::cl::opt Pedantic; -} - namespace pdb2yaml { extern llvm::cl::opt All; extern llvm::cl::opt NoFileHeaders; Index: tools/llvm-pdbutil/llvm-pdbutil.cpp =================================================================== --- tools/llvm-pdbutil/llvm-pdbutil.cpp +++ tools/llvm-pdbutil/llvm-pdbutil.cpp @@ -256,11 +256,6 @@ } namespace diff { -cl::opt Pedantic("pedantic", - cl::desc("Finds all differences (even structural ones " - "that produce otherwise identical PDBs)"), - cl::sub(DiffSubcommand)); - cl::list InputFilenames(cl::Positional, cl::desc(" "), cl::OneOrMore, cl::sub(DiffSubcommand)); Index: unittests/Option/OptionParsingTest.cpp =================================================================== --- unittests/Option/OptionParsingTest.cpp +++ unittests/Option/OptionParsingTest.cpp @@ -18,8 +18,9 @@ enum ID { OPT_INVALID = 0, // This is not an option ID. -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR) OPT_##ID, +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OPT_##ID, #include "Opts.inc" LastOption #undef OPTION @@ -36,10 +37,10 @@ }; static const OptTable::Info InfoTable[] = { -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR) \ - { PREFIX, NAME, HELPTEXT, METAVAR, OPT_##ID, Option::KIND##Class, PARAM, \ - FLAGS, OPT_##GROUP, OPT_##ALIAS, ALIASARGS }, +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + {PREFIX, NAME, HELPTEXT, METAVAR, OPT_##ID, Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, OPT_##ALIAS, ALIASARGS, VALUES}, #include "Opts.inc" #undef OPTION }; Index: unittests/Support/CommandLineTest.cpp =================================================================== --- unittests/Support/CommandLineTest.cpp +++ unittests/Support/CommandLineTest.cpp @@ -13,6 +13,7 @@ #include "llvm/Config/config.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" #include "llvm/Support/StringSaver.h" #include "gtest/gtest.h" #include @@ -546,6 +547,11 @@ } } +TEST(CommandLineTest, ArgumentLimit) { + std::string args(32 * 4096, 'a'); + EXPECT_FALSE(llvm::sys::commandLineFitsWithinSystemLimits("cl", args.data())); +} + TEST(CommandLineTest, ResponseFiles) { llvm::SmallString<128> TestDir; std::error_code EC = Index: unittests/Support/ErrorTest.cpp =================================================================== --- unittests/Support/ErrorTest.cpp +++ unittests/Support/ErrorTest.cpp @@ -475,6 +475,10 @@ int X = cantFail(Expected(42)); EXPECT_EQ(X, 42) << "Expected value modified by cantFail"; + + int Dummy = 42; + int &Y = cantFail(Expected(Dummy)); + EXPECT_EQ(&Dummy, &Y) << "Reference mangled by cantFail"; } // Test that cantFail results in a crash if you pass it a failure value. 
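The reference test added above depends on the unwrapped result aliasing the original object rather than copying it. A minimal standalone sketch of that behavior follows; the variable names are invented for illustration and are not part of the patch.

#include "llvm/Support/Error.h"
#include <cassert>

int main() {
  int Counter = 0;
  // The reference-taking cantFail overload hands back a reference to the very
  // object wrapped by the Expected, so writes through it are visible outside.
  int &Ref = llvm::cantFail(llvm::Expected<int &>(Counter));
  ++Ref;
  assert(&Ref == &Counter && Counter == 1);
  return 0;
}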
Index: utils/TableGen/CodeGenDAGPatterns.cpp =================================================================== --- utils/TableGen/CodeGenDAGPatterns.cpp +++ utils/TableGen/CodeGenDAGPatterns.cpp @@ -2762,8 +2762,8 @@ AnalyzeNode(Pat->getTree(0)); } - void Analyze(const PatternToMatch *Pat) { - AnalyzeNode(Pat->getSrcPattern()); + void Analyze(const PatternToMatch &Pat) { + AnalyzeNode(Pat.getSrcPattern()); } private: @@ -3289,9 +3289,7 @@ // Second, look for single-instruction patterns defined outside the // instruction. - for (ptm_iterator I = ptm_begin(), E = ptm_end(); I != E; ++I) { - const PatternToMatch &PTM = *I; - + for (const PatternToMatch &PTM : ptms()) { // We can only infer from single-instruction patterns, otherwise we won't // know which instruction should get the flags. SmallVector PatInstrs; @@ -3307,7 +3305,7 @@ continue; InstAnalyzer PatInfo(*this); - PatInfo.Analyze(&PTM); + PatInfo.Analyze(PTM); Errors += InferFromPattern(InstInfo, PatInfo, PTM.getSrcRecord()); } @@ -3367,7 +3365,7 @@ // Analyze the source pattern. InstAnalyzer PatInfo(*this); - PatInfo.Analyze(&PTM); + PatInfo.Analyze(PTM); // Collect error messages. SmallVector Msgs; Index: utils/TableGen/OptParserEmitter.cpp =================================================================== --- utils/TableGen/OptParserEmitter.cpp +++ utils/TableGen/OptParserEmitter.cpp @@ -196,6 +196,9 @@ OS << ", nullptr"; // The option meta-variable name (unused). + OS << ", nullptr"; + + // The option Values (unused for groups). OS << ", nullptr)\n"; } OS << "\n"; @@ -285,6 +288,13 @@ else OS << "nullptr"; + // The option Values. Used for shell autocompletion. + OS << ", "; + if (!isa(R.getValueInit("Values"))) + write_cstring(OS, R.getValueAsString("Values")); + else + OS << "nullptr"; + OS << ")\n"; } OS << "#endif // OPTION\n";
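For orientation, here is a hedged sketch of what a generated Opts.inc entry could look like once the emitter above writes the extra Values column; the option name, help text, metavar, and value list are invented and not taken from the patch. The trailing string is the new VALUES argument used for shell autocompletion, assumed here to be a comma-separated list; options without a Values setting get nullptr instead.

// Hypothetical entry that OptParserEmitter could produce in Opts.inc; the
// last argument is the new VALUES column added by this patch.
OPTION(prefix_1, "color=", color_EQ, Joined, INVALID, INVALID, nullptr, 0, 0,
       "Colorize the output", "<when>", "always,auto,never")

With the OPTION macros updated as in llvm-cvtres and the option-parsing unit test above, such an entry expands into an OptTable::Info record whose final field carries the same string.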