diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -877,7 +877,7 @@
   return getMul64(Builder, LHS, RHS).second;
 }
 
-/// Figure out how many bits are really needed for this ddivision. \p AtLeast is
+/// Figure out how many bits are really needed for this division. \p AtLeast is
 /// an optimization hint to bypass the second ComputeNumSignBits call if we the
 /// first one is insufficient. Returns -1 on failure.
 int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -123,7 +123,7 @@
 // Separate load nodes are defined to glue m0 initialization in
 // SelectionDAG. The GISel selector can just insert m0 initialization
-// directly before before selecting a glue-less load, so hide this
+// directly before selecting a glue-less load, so hide this
 // distinction.
 def : GINodeEquiv {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2840,7 +2840,7 @@
       }
     }
   }
-  // If "AllUsesAcceptSReg == false" so far we haven't suceeded
+  // If "AllUsesAcceptSReg == false" so far we haven't succeeded
   // commuting current user. This means have at least one use
   // that strictly require VGPR. Thus, we will not attempt to commute
   // other user instructions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1627,7 +1627,7 @@
   }
 
   // The legalizer preprocessed the intrinsic arguments. If we aren't using
-  // NSA, these should have beeen packed into a single value in the first
+  // NSA, these should have been packed into a single value in the first
   // address register
   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1505,7 +1505,7 @@
     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
     .clampMaxNumElements(0, S16, 64);
 
-  // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
+  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
   // pre-legalize.
   if (ST.hasVOP3PInsts()) {
     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
@@ -4370,7 +4370,7 @@
 ///
 /// We don't want to directly select image instructions just yet, but also want
 /// to exposes all register repacking to the legalizer/combiners. We also don't
-/// want a selected instrution entering RegBankSelect. In order to avoid
+/// want a selected instruction entering RegBankSelect. In order to avoid
 /// defining a multitude of intermediate image instructions, directly hack on
 /// the intrinsic's arguments. In cases like a16 addresses, this requires
 /// padding now unnecessary arguments with $noreg.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -73,7 +73,7 @@
   const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
 
   Align MaxAlign;
-  // FIXME: Alignment is broken broken with explicit arg offset.;
+  // FIXME: Alignment is broken with explicit arg offset.;
   const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
   if (TotalKernArgSize == 0)
     return false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -14,7 +14,7 @@
 // known address. AMDGPUMachineFunction allocates the LDS global.
 //
 // Local variables with constant annotation or non-undef initializer are passed
-// through unchanged for simplication or error diagnostics in later passes.
+// through unchanged for simplification or error diagnostics in later passes.
 //
 // To reduce the memory overhead variables that are only used by kernels are
 // excluded from this transform. The analysis to determine whether a variable
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -1295,7 +1295,7 @@
   }
 }
 
-// If a region region is just a sequence of regions (and the exit
+// If a region is just a sequence of regions (and the exit
 // block in the case of the top level region), we can simply skip
 // linearizing it, because it is already linear
 bool regionIsSequence(RegionMRT *Region) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -71,7 +71,7 @@
   return new AMDGPUOpenCLEnqueuedBlockLowering();
 }
 
-/// Collect direct or indrect callers of \p F and save them
+/// Collect direct or indirect callers of \p F and save them
 /// to \p Callers.
 static void collectCallers(Function *F, DenseSet &Callers) {
   for (auto U : F->users()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -917,7 +917,7 @@
   // usage order.
   //
   // FIXME: It is also possible that if we're allowed to use all of the memory
-  // could could end up using more than the maximum due to alignment padding.
+  // could end up using more than the maximum due to alignment padding.
 
   uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
   uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -76,7 +76,7 @@
         LD->getPointerOperand()->stripInBoundsOffsets() != Ptr)
       break;
     const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(LD);
-    // TODO: This load poprobably can be promoted to constant address space.
+    // TODO: This load probably can be promoted to constant address space.
     if (MSSA->isLiveOnEntryDef(MA))
       Ptrs.push_back(LD);
     break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1789,7 +1789,7 @@
 }
 
 /// Utility function for pushing dynamic vector indexes with a constant offset
-/// into waterwall loops.
+/// into waterfall loops.
 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
                                    MachineInstr &IdxUseInstr,
                                    unsigned OpIdx,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// The AMDGPU TargetMachine interface definition for hw codgen targets.
+/// The AMDGPU TargetMachine interface definition for hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -1487,8 +1487,7 @@
 MachineBasicBlock *
 AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
                                                 MachineBasicBlock *PredMBB) {
-  assert(PredMBB->isSuccessor(MBB) &&
-         "succBlk is not a prececessor of curBlk");
+  assert(PredMBB->isSuccessor(MBB) && "succBlk is not a predecessor of curBlk");
 
   MachineBasicBlock *CloneMBB = clone(MBB);  //clone instructions
   replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
diff --git a/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h b/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
--- a/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
+++ b/llvm/lib/Target/AMDGPU/AMDKernelCodeT.h
@@ -142,7 +142,7 @@
   /// is provided to the finalizer when it is invoked and is recorded
   /// here. The hardware will interleave the memory requests of each
   /// lane of a wavefront by this element size to ensure each
-  /// work-item gets a distinct memory memory location. Therefore, the
+  /// work-item gets a distinct memory location. Therefore, the
   /// finalizer ensures that all load and store operations done to
   /// private memory do not exceed this size. For example, if the
   /// element size is 4 (32-bits or dword) and a 64-bit value must be
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1897,7 +1897,7 @@
   // We allow fp literals with f16x2 operands assuming that the specified
   // literal goes into the lower half and the upper half is zero. We also
-  // require that the literal may be losslesly converted to f16.
+  // require that the literal may be losslessly converted to f16.
   MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 :
                      (type == MVT::v2i16)? MVT::i16 :
                      (type == MVT::v2f32)? MVT::f32 : type;
@@ -2927,7 +2927,7 @@
 // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF
 // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001
 // Negative fp literals with preceding "-" are
-// handled likewise for unifomtity
+// handled likewise for uniformity
 //
 bool
 AMDGPUAsmParser::parseSP3NegModifier() {
@@ -6310,7 +6310,7 @@
   using namespace llvm::AMDGPU::SendMsg;
 
   // Validation strictness depends on whether message is specified
-  // in a symbolc or in a numeric form. In the latter case
+  // in a symbolic or in a numeric form. In the latter case
   // only encoding possibility is checked.
   bool Strict = Msg.IsSymbolic;
@@ -8352,7 +8352,7 @@
 #define GET_MNEMONIC_CHECKER
 #include "AMDGPUGenAsmMatcher.inc"
 
-// This fuction should be defined after auto-generated include so that we have
+// This function should be defined after auto-generated include so that we have
 // MatchClassKind enum defined
 unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
                                                      unsigned Kind) {
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -136,7 +136,7 @@
   bits<3> nfmt = format{6-4};
 
   // GFX90A+ only: instruction uses AccVGPR for data
-  // Bit superceedes tfe.
+  // Bit supersedes tfe.
   bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
 }
@@ -370,7 +370,7 @@
   bits<8> soffset;
 
   // GFX90A+ only: instruction uses AccVGPR for data
-  // Bit superceedes tfe.
+  // Bit supersedes tfe.
   bits<1> acc = !if(ps.has_vdata, vdata{9}, 0);
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -20,7 +20,7 @@
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
-// Hazard Recoginizer Implementation
+// Hazard Recognizer Implementation
 //===----------------------------------------------------------------------===//
 
 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
@@ -534,7 +534,7 @@
   // In order to handle these situations correctly we need to make sure that
   // when a clause has more than one instruction, no instruction in the clause
   // writes to a register that is read by another instruction in the clause
-  // (including itself). If we encounter this situaion, we need to break the
+  // (including itself). If we encounter this situation, we need to break the
   // clause by inserting a non SMEM instruction.
 
   for (MachineInstr *MI : EmittedInstrs) {
diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
--- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -1,4 +1,4 @@
-//===-- GCNNSAReassign.cpp - Reassign registers in NSA unstructions -------===//
+//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,7 +8,7 @@
 //
 /// \file
 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
-/// in NSA image instructions. Later SIShrinkInstructions pass will relace NSA
+/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
 /// with sequential versions where possible.
 ///
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -10,7 +10,7 @@
 /// This file defines the GCNRegPressure class, which tracks registry pressure
 /// by bookkeeping number of SGPR/VGPRs used, weights for large SGPR/VGPRs. It
 /// also implements a compare function, which compares different register
-/// pressures, and declares one with max occupance as winner.
+/// pressures, and declares one with max occupancy as winner.
 ///
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -691,7 +691,7 @@
     OS.emitBytes(VendorName);
     OS.emitInt8(0); // NULL terminate VendorName
     OS.emitBytes(ArchName);
-    OS.emitInt8(0); // NULL terminte ArchName
+    OS.emitInt8(0); // NULL terminate ArchName
   });
 }
diff --git a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
--- a/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer.
+/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative manner.
 /// This pass is merging consecutive CFAlus where applicable.
 /// It needs to be called after IfCvt for best results.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
--- a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -327,9 +327,9 @@
 } // end anonymous namespace
 
 INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
-                      "R600 Emit Clause Markters", false, false)
+                      "R600 Emit Clause Markers", false, false)
 INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
-                    "R600 Emit Clause Markters", false, false)
+                    "R600 Emit Clause Markers", false, false)
 
 FunctionPass *llvm::createR600EmitClauseMarkers() {
   return new R600EmitClauseMarkers();
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -995,7 +995,7 @@
 /// LLVM generates byte-addressed pointers. For indirect addressing, we need to
 /// convert these pointers to a register index. Each register holds
 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
-/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
+/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
 /// for indirect addressing.
 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                                unsigned StackWidth,
@@ -1100,7 +1100,7 @@
   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                  DAG.getConstant(3, DL, MVT::i32));
 
-  // TODO: Contrary to the name of the functiom,
+  // TODO: Contrary to the name of the function,
   // it also handles sub i32 non-truncating stores (like i1)
   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                   Store->getValue());
@@ -1610,7 +1610,7 @@
     if (NewBldVec[i].isUndef())
       // We mask write here to teach later passes that the ith element of this
       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
-      // break false dependencies and additionnaly make assembly easier to read.
+      // break false dependencies and additionally make assembly easier to read.
       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
     if (ConstantFPSDNode *C = dyn_cast(NewBldVec[i])) {
       if (C->isZero()) {
diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
--- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -207,7 +207,7 @@
     return !ARDef || !ARUse;
   }
 
-  // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
+  // isLegalToPruneDependencies - Is it legal to prune dependency between SUI
   // and SUJ.
   bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override {
     return false;
diff --git a/llvm/lib/Target/AMDGPU/R600TargetMachine.h b/llvm/lib/Target/AMDGPU/R600TargetMachine.h
--- a/llvm/lib/Target/AMDGPU/R600TargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetMachine.h
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// The AMDGPU TargetMachine interface definition for hw codgen targets.
+/// The AMDGPU TargetMachine interface definition for hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -241,7 +241,7 @@
 }
 
 // Check register def/use conflicts, occupancy limits and collect def/use maps.
-// Return true if instruction can be bundled with previous. It it cannot
+// Return true if instruction can be bundled with previous. If it cannot
 // def/use maps are not updated.
 bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI,
                                          RegUse &Defs, RegUse &Uses,
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1200,7 +1200,7 @@
       }
     }
 
-    // Stack slot coloring may assign different objets to the same stack slot.
+    // Stack slot coloring may assign different objects to the same stack slot.
     // If not, then the VGPR to AGPR spill slot is dead.
     for (unsigned FI : SpillFIs.set_bits())
       if (!NonVGPRSpillFIs.test(FI))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1574,11 +1574,11 @@
   if (Subtarget->hasUnalignedBufferAccessEnabled() &&
       !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
        AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
-    // If we have an uniform constant load, it still requires using a slow
+    // If we have a uniform constant load, it still requires using a slow
     // buffer instruction if unaligned.
     if (IsFast) {
       // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
-      // 2-byte alignment is worse than 1 unless doing a 2-byte accesss.
+      // 2-byte alignment is worse than 1 unless doing a 2-byte access.
       *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
                  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
         Alignment >= Align(4) : Alignment != Align(2);
@@ -4558,7 +4558,7 @@
   // Otherwise f32 mad is always full rate and returns the same result as
   // the separate operations so should be preferred over fma.
-  // However does not support denomals.
+  // However does not support denormals.
   if (hasFP32Denormals(MF))
     return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
@@ -8400,7 +8400,7 @@
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *MFI = MF.getInfo();
 
-  // If there is a possibilty that flat instruction access scratch memory
+  // If there is a possibility that flat instruction access scratch memory
   // then we need to use the same legalization rules we use for private.
   if (AS == AMDGPUAS::FLAT_ADDRESS &&
       !Subtarget->hasMultiDwordFlatScratchAddressing())
@@ -8488,7 +8488,7 @@
     if (NumElements > 2)
       return SplitVectorLoad(Op, DAG);
 
-    // SI has a hardware bug in the LDS / GDS boounds checking: if the base
+    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
     // address is negative, then the instruction is incorrectly treated as
     // out-of-bounds even if base + offsets is in bounds. Split vectorized
     // loads here to avoid emitting ds_read2_b32. We may re-combine the
@@ -8950,7 +8950,7 @@
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *MFI = MF.getInfo();
 
-  // If there is a possibilty that flat instruction access scratch memory
+  // If there is a possibility that flat instruction access scratch memory
   // then we need to use the same legalization rules we use for private.
   if (AS == AMDGPUAS::FLAT_ADDRESS &&
       !Subtarget->hasMultiDwordFlatScratchAddressing())
@@ -8999,7 +8999,7 @@
     if (NumElements > 2)
       return SplitVectorStore(Op, DAG);
 
-    // SI has a hardware bug in the LDS / GDS boounds checking: if the base
+    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
     // address is negative, then the instruction is incorrectly treated as
     // out-of-bounds even if base + offsets is in bounds. Split vectorized
     // stores here to avoid emitting ds_write2_b32. We may re-combine the
@@ -10039,7 +10039,7 @@
     }
   }
 
-  // If one half is undef, and one is constant, perfer a splat vector rather
+  // If one half is undef, and one is constant, prefer a splat vector rather
   // than the normal qNaN. If it's a register, prefer 0.0 since that's
   // cheaper to use and may be free with a packed operation.
   if (NewElts[0].isUndef()) {
@@ -10761,7 +10761,7 @@
   SDValue RHS = N->getOperand(1);
 
   // These should really be instruction patterns, but writing patterns with
-  // source modiifiers is a pain.
+  // source modifiers is a pain.
   // fadd (fadd (a, a), b) -> mad 2.0, a, b
   if (LHS.getOpcode() == ISD::FADD) {
@@ -10858,8 +10858,8 @@
     return SDValue();
 
   // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
-  // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
-  // is sufficient to allow generaing fdot2.
+  // regardless of the denorm mode setting. Therefore,
+  // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
   const TargetOptions &Options = DAG.getTarget().Options;
   if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
       (N->getFlags().hasAllowContract() &&
@@ -11560,7 +11560,7 @@
   if (DstSize < InitIdx)
     return;
 
-  // Create a register for the intialization value.
+  // Create a register for the initialization value.
   Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
   unsigned NewDst = 0; // Final initialized value will be in here
@@ -11606,7 +11606,7 @@
     TII->legalizeOperandsVOP3(MRI, MI);
 
     // Prefer VGPRs over AGPRs in mAI instructions where possible.
-    // This saves a chain-copy of registers and better ballance register
+    // This saves a chain-copy of registers and better balance register
    // use between vgpr and agpr as agpr tuples tend to be big.
     if (MI.getDesc().OpInfo) {
       unsigned Opc = MI.getOpcode();
@@ -12451,8 +12451,8 @@
            : AtomicExpansionKind::CmpXChg;
   }
 
-  // DS FP atomics do repect the denormal mode, but the rounding mode is fixed
-  // to round-to-nearest-even.
+  // DS FP atomics do respect the denormal mode, but the rounding mode is
+  // fixed to round-to-nearest-even.
   // The only exception is DS_ADD_F64 which never flushes regardless of mode.
   if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
     if (!Ty->isDoubleTy())
@@ -12498,7 +12498,7 @@
 // always uniform.
 static bool hasCFUser(const Value *V, SmallPtrSet &Visited,
                       unsigned WaveSize) {
-  // FIXME: We asssume we never cast the mask results of a control flow
+  // FIXME: We assume we never cast the mask results of a control flow
   // intrinsic.
   // Early exit if the type won't be consistent as a compile time hack.
   IntegerType *IT = dyn_cast(V->getType());
@@ -12602,7 +12602,7 @@
                                            SDValue N1) const {
   if (!N0.hasOneUse())
     return false;
-  // Take care of the oportunity to keep N0 uniform
+  // Take care of the opportunity to keep N0 uniform
   if (N0->isDivergent() || !N1->isDivergent())
     return true;
   // Check if we have a good chance to form the memory access pattern with the
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1040,7 +1040,7 @@
   if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
     // The function is going to insert a wait on everything in its prolog.
     // This still needs to be careful if the call target is a load (e.g. a GOT
-    // load). We also need to check WAW depenancy with saved PC.
+    // load). We also need to check WAW dependency with saved PC.
     Wait = AMDGPU::Waitcnt();
 
     int CallAddrOpIdx =
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -203,7 +203,7 @@
     if (Offset0Idx == -1 || Offset1Idx == -1)
       return false;
 
-    // XXX - be careful of datalesss loads
+    // XXX - be careful of dataless loads
     // getNamedOperandIdx returns the index for MachineInstrs. Since they
     // include the output in the operand list, but SDNodes don't, we need to
     // subtract the index by one.
@@ -486,7 +486,7 @@
     return false;
   }
 
-  // In order to avoid regester pressure, on an average, the number of DWORDS
+  // In order to avoid register pressure, on an average, the number of DWORDS
   // loaded together by all clustered mem ops should not exceed 8. This is an
   // empirical value based on certain observations and performance related
   // experiments.
@@ -2871,7 +2871,7 @@
   default:
     return false;
   case AMDGPU::S_MOV_B64:
-    // TODO: We could fold 64-bit immediates, but this get compilicated
+    // TODO: We could fold 64-bit immediates, but this get complicated
     // when there are sub-registers.
     return false;
@@ -2951,7 +2951,7 @@
     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
 
     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
-    // We should only expect these to be on src0 due to canonicalizations.
+    // We should only expect these to be on src0 due to canonicalization.
     if (Src0->isReg() && Src0->getReg() == Reg) {
       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
         return false;
@@ -4061,9 +4061,9 @@
   int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
 
-  const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
+  const int OpIndices[] = {DstIdx, Src0Idx, Src1Idx, Src2Idx};
 
-  for (int OpIdx: OpIndicies) {
+  for (int OpIdx : OpIndices) {
     if (OpIdx == -1)
       continue;
     const MachineOperand &MO = MI.getOperand(OpIdx);
@@ -4226,7 +4226,7 @@
     SGPRUsed = findImplicitSGPRRead(MI);
     if (SGPRUsed != AMDGPU::NoRegister) {
-      // Implicit uses may safely overlap true overands
+      // Implicit uses may safely overlap true operands
       if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
             return !RI.regsOverlap(SGPRUsed, SGPR);
           })) {
@@ -4703,7 +4703,7 @@
   bool IsAllocatable = false;
   if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
     // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
-    // with two data operands. Request register class constainted to VGPR only
+    // with two data operands. Request register class constrained to VGPR only
     // of both operands present as Machine Copy Propagation can not check this
     // constraint and possibly other passes too.
     //
@@ -5262,7 +5262,7 @@
   const MCInstrDesc &NewDesc = get(NewOpc);
   Inst.setDesc(NewDesc);
 
-  // Callers expect interator to be valid after this call, so modify the
+  // Callers expect iterator to be valid after this call, so modify the
   // instruction in place.
   if (OldVAddrIdx == NewVAddrIdx) {
     MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
@@ -5271,7 +5271,7 @@
     MRI.moveOperands(&NewVAddr, &SAddr, 1);
     Inst.RemoveOperand(OldSAddrIdx);
     // Update the use list with the pointer we have just moved from vaddr to
-    // saddr poisition. Otherwise new vaddr will be missing from the use list.
+    // saddr position. Otherwise new vaddr will be missing from the use list.
     MRI.removeRegOperandFromUseList(&NewVAddr);
     MRI.addRegOperandToUseList(&NewVAddr);
   } else {
@@ -5428,7 +5428,7 @@
     else
       Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));
 
-    // Combine the comparision results with AND.
+    // Combine the comparison results with AND.
     if (CondReg == AMDGPU::NoRegister) // First.
      CondReg = NewCondReg;
     else { // If not the first, we create an AND.
@@ -5792,7 +5792,7 @@
   if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
                            RI.getRegClass(RsrcRC))) {
     // The operands are legal.
-    // FIXME: We may need to legalize operands besided srsrc.
+    // FIXME: We may need to legalize operands besides srsrc.
     return CreatedBB;
   }
@@ -5866,7 +5866,7 @@
   MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
   unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
 
-  // Atomics rith return have have an additional tied operand and are
+  // Atomics with return have an additional tied operand and are
   // missing some of the special bits.
   MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
   MachineInstr *Addr64;
@@ -6497,7 +6497,7 @@
     // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
     // invert either source and then perform the XOR. If either source is a
     // scalar register, then we can leave the inversion on the scalar unit to
-    // acheive a better distrubution of scalar and vector instructions.
+    // achieve a better distribution of scalar and vector instructions.
     bool Src0IsSGPR = Src0.isReg() &&
                       RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
     bool Src1IsSGPR = Src1.isReg() &&
@@ -6719,7 +6719,7 @@
   legalizeOperands(*LoHalf, MDT);
   legalizeOperands(*HiHalf, MDT);
 
-  // Move all users of this moved vlaue.
+  // Move all users of this moved value.
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
@@ -6783,7 +6783,7 @@
   Worklist.insert(&LoHalf);
   Worklist.insert(&HiHalf);
 
-  // Move all users of this moved vlaue.
+  // Move all users of this moved value.
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
@@ -6861,7 +6861,7 @@
   MRI.replaceRegWith(Dest.getReg(), ResultReg);
 
-  // We don't need to legalize operands here. src0 for etiher instruction can be
+  // We don't need to legalize operands here. src0 for either instruction can be
   // an SGPR, and the second input is unused or determined here.
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
@@ -7075,7 +7075,7 @@
     assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse());
 
     MachineInstr *SCCUseInst = Op.getParent();
-    // Look for a preceeding instruction that either defines VCC or SCC. If VCC
+    // Look for a preceding instruction that either defines VCC or SCC. If VCC
     // then there is nothing to do because the defining instruction has been
     // converted to a VALU already. If SCC then that instruction needs to be
     // converted to a VALU.
@@ -8190,7 +8190,7 @@
   const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
                                this](int64_t ExpectedValue, unsigned SrcSize,
-                                     bool IsReversable, bool IsSigned) -> bool {
+                                     bool IsReversible, bool IsSigned) -> bool {
     // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
     // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
     // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
@@ -8248,7 +8248,7 @@
     bool IsReversedCC = false;
     if (CmpValue != ExpectedValue) {
-      if (!IsReversable)
+      if (!IsReversible)
         return false;
       IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
       if (!IsReversedCC)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1963,7 +1963,7 @@
 class getAsmSDWA {
   string dst = !if(HasDst,
                    !if(!eq(DstVT.Size, 1),
-                        " vcc", // use vcc token as dst for VOPC instructioins
+                        " vcc", // use vcc token as dst for VOPC instructions
                         "$vdst"),
                     "");
   string src0 = "$src0_modifiers";
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -1822,7 +1822,7 @@
   // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
   // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
   // as the new-base(anchor) because of the maximum distance which can
-  // accomodate more intermediate bases presumeably.
+  // accommodate more intermediate bases presumably.
   //
   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
   // (&a + 8192) for load1, load2, load4.
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -540,7 +540,7 @@
     return;
 
   // Make sure we do not modify exec between def and use.
-  // A copy with implcitly defined exec inserted earlier is an exclusion, it
+  // A copy with implicitly defined exec inserted earlier is an exclusion, it
   // does not really modify exec.
   for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
     if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
@@ -580,7 +580,7 @@
 }
 
 void SILowerControlFlow::optimizeEndCf() {
-  // If the only instruction immediately following this END_CF is an another
+  // If the only instruction immediately following this END_CF is another
   // END_CF in the only successor we can avoid emitting exec mask restore here.
   if (!EnableOptimizeEndCf)
     return;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -320,7 +320,7 @@
     SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, SpillFI));
 
-    // Add this register as live-in to all blocks to avoid machine verifer
+    // Add this register as live-in to all blocks to avoid machine verifier
     // complaining about use of an undefined physical register.
     for (MachineBasicBlock &BB : MF)
       BB.addLiveIn(LaneVGPR);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -64,7 +64,7 @@
 // First the instructions are put into blocks.
 // We want the blocks help control register usage and hide high latencies
 // later. To help control register usage, we typically want all local
-// computations, when for example you create a result that can be comsummed
+// computations, when for example you create a result that can be consumed
 // right away, to be contained in a block. Block inputs and outputs would
 // typically be important results that are needed in several locations of
 // the shader. Since we do want blocks to help hide high latencies, we want
@@ -90,8 +90,8 @@
 // Increasing the number of active wavefronts helps hide the former, but it
 // doesn't solve the latter, thus why even if wavefront count is high, we have
 // to try have as many instructions hiding high latencies as possible.
-// The OpenCL doc says for example latency of 400 cycles for a global mem access,
-// which is hidden by 10 instructions if the wavefront count is 10.
+// The OpenCL doc says for example latency of 400 cycles for a global mem
+// access, which is hidden by 10 instructions if the wavefront count is 10.
 
 // Some figures taken from AMD docs:
 // Both texture and constant L1 caches are 4-way associative with 64 bytes
@@ -353,7 +353,7 @@
   // able to correctly handle 5 vs 6, 2 vs 3.
   // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
   // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
-  // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
+  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
   // The use of findDefBetween removes the case 4.
   for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
     Register Reg = RegMaskPair.RegUnit;
@@ -402,7 +402,7 @@
     nodeScheduled(SU);
   }
 
-  // TODO: compute InternalAdditionnalPressure.
+  // TODO: compute InternalAdditionalPressure.
   InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());
 
   // Check everything is right.
@@ -696,7 +696,7 @@
         bool HasSubGraph;
         std::vector SubGraph;
         // By construction (topological order), if SU and
-        // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
+        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary
         // in the parent graph of SU.
 #ifndef NDEBUG
         SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
@@ -1131,7 +1131,7 @@
       bool HasSubGraph;
       std::vector SubGraph;
       // By construction (topological order), if SU and
-      // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
+      // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary
       // in the parent graph of SU.
 #ifndef NDEBUG
       SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
@@ -1148,7 +1148,7 @@
         for (unsigned k : SubGraph) {
           if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
             // Other instructions than EXP would be required in the group.
-            // Abort the groupping.
+            // Abort the grouping.
             return;
         }
       }
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -63,7 +63,7 @@
 };
 
 /// The distinct address spaces supported by the AMDGPU target for
-/// atomic memory operation. Can be ORed toether.
+/// atomic memory operation. Can be ORed together.
 enum class SIAtomicAddrSpace {
   NONE = 0u,
   GLOBAL = 1u << 0,
@@ -943,7 +943,7 @@
   case SIAtomicScope::WAVEFRONT:
   case SIAtomicScope::SINGLETHREAD:
     // The LDS keeps all memory operations in order for
-    // the same wavesfront.
+    // the same wavefront.
     break;
   default:
     llvm_unreachable("Unsupported synchronization scope");
@@ -1547,7 +1547,7 @@
   case SIAtomicScope::WAVEFRONT:
   case SIAtomicScope::SINGLETHREAD:
     // The LDS keeps all memory operations in order for
-    // the same wavesfront.
+    // the same wavefront.
     break;
   default:
     llvm_unreachable("Unsupported synchronization scope");
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -402,7 +402,7 @@
     }
 
     // If the only user of a logical operation is move to exec, fold it now
-    // to prevent forming of saveexec. I.e:
+    // to prevent forming of saveexec. I.e.:
     //
     // %0:sreg_64 = COPY $exec
     // %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -316,7 +316,7 @@
   }
   if (Abs || Neg) {
     assert(!Sext &&
-           "Float and integer src modifiers can't be set simulteniously");
+           "Float and integer src modifiers can't be set simultaneously");
     Mods |= Abs ? SISrcMods::ABS : 0u;
     Mods ^= Neg ? SISrcMods::NEG : 0u;
   } else if (Sext) {
@@ -1131,16 +1131,16 @@
   bool Converted = false;
   for (auto &Operand : SDWAOperands) {
     LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
-    // There should be no intesection between SDWA operands and potential MIs
+    // There should be no intersection between SDWA operands and potential MIs
     // e.g.:
     // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
     // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
     // v_add_u32 v3, v4, v2
     //
-    // In that example it is possible that we would fold 2nd instruction into 3rd
-    // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
-    // already destroyed). So if SDWAOperand is also a potential MI then do not
-    // apply it.
+    // In that example it is possible that we would fold 2nd instruction into
+    // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that
+    // was already destroyed). So if SDWAOperand is also a potential MI then do
+    // not apply it.
     if (PotentialMatches.count(Operand->getParentInst()) == 0)
       Converted |= Operand->convertToSDWA(*SDWAInst, TII);
   }
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -133,7 +133,7 @@
     return Changed;
   MaskValue = M->getOperand(1).getImm();
   // First if sreg is only used in the AND instruction fold the immediate
-  // into into the AND.
+  // into the AND.
   if (!ReadsSreg && Op2.isKill()) {
     A->getOperand(2).ChangeToImmediate(MaskValue);
     M->eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -97,7 +97,7 @@
                   []);
 }
 
-// Generates list of dags for register tupless.
+// Generates list of dags for register tuples.
 class RegSeqDags {
   dag trunc_rc = (trunc RC,
@@ -868,7 +868,7 @@
 // ever be allocated using VReg_1. This is a hack for SelectionDAG
 // that should always be lowered by SILowerI1Copies. TableGen crashes
 // on an empty register set, but also sorts register classes based on
-// the number of registerss in them.  Add only one register so this is
+// the number of registers in them.  Add only one register so this is
 // sorted to the end and not preferred over VGPR_32.
 def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add ARTIFICIAL_VGPR)> {
   let Size = 1;
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -464,11 +464,11 @@
 // Returns next valid instruction pointer if was able to create v_swap_b32.
 //
 // This shall not be done too early not to prevent possible folding which may
-// remove matched moves, and this should prefereably be done before RA to
+// remove matched moves, and this should preferably be done before RA to
 // release saved registers and also possibly after RA which can insert copies
 // too.
 //
-// This is really just a generic peephole that is not a canocical shrinking,
+// This is really just a generic peephole that is not a canonical shrinking,
 // although requirements match the pass placement and it reduces code size too.
 static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
                                const SIInstrInfo *TII) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -969,7 +969,7 @@
   MachineInstr *WQMMaskMI = nullptr;
   Register LiveMaskWQM;
   if (IsDemote) {
-    // Demotes deactive quads with only helper lanes
+    // Demote - deactivate quads with only helper lanes
     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
     WQMMaskMI =
         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
@@ -977,7 +977,7 @@
         .addReg(Exec)
         .addReg(LiveMaskWQM);
   } else {
-    // Kills deactivate lanes
+    // Kill - deactivate lanes no longer in live mask
     if (Op.isImm()) {
       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -931,7 +931,7 @@
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
 
-// Track defaults for fields in the MODE registser.
+// Track defaults for fields in the MODE register.
 struct SIModeRegisterDefaults {
   /// Floating point opcodes that support exception flag gathering quiet and
   /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -390,7 +390,7 @@
   let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
   // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs.
   // We then create two versions of the instruction: with tied dst and src2
-  // and with the eralyclobber flag on the dst. This is strciter than the
+  // and with the earlyclobber flag on the dst. This is stricter than the
   // actual HW restriction. In particular earlyclobber also affects src0 and
   // src1 allocation which is not required.
   bit NoDstOverlap = !gt(DstVT.Size, 128);